Diffstat (limited to 'llvm')
177 files changed, 19311 insertions, 11373 deletions
diff --git a/llvm/docs/CodingStandards.rst b/llvm/docs/CodingStandards.rst index 732227b..2dc3d77 100644 --- a/llvm/docs/CodingStandards.rst +++ b/llvm/docs/CodingStandards.rst @@ -1594,20 +1594,25 @@ Restrict Visibility ^^^^^^^^^^^^^^^^^^^ Functions and variables should have the most restricted visibility possible. + For class members, that means using appropriate ``private``, ``protected``, or -``public`` keyword to restrict their access. For non-member functions, variables, -and classes, that means restricting visibility to a single ``.cpp`` file if it's -not referenced outside that file. +``public`` keyword to restrict their access. + +For non-member functions, variables, and classes, that means restricting +visibility to a single ``.cpp`` file if it is not referenced outside that file. Visibility of file-scope non-member variables and functions can be restricted to the current translation unit by using either the ``static`` keyword or an anonymous -namespace. Anonymous namespaces are a great language feature that tells the C++ +namespace. + +Anonymous namespaces are a great language feature that tells the C++ compiler that the contents of the namespace are only visible within the current translation unit, allowing more aggressive optimization and eliminating the -possibility of symbol name collisions. Anonymous namespaces are to C++ as -``static`` is to C functions and global variables. While ``static`` is available -in C++, anonymous namespaces are more general: they can make entire classes -private to a file. +possibility of symbol name collisions. + +Anonymous namespaces are to C++ as ``static`` is to C functions and global +variables. While ``static`` is available in C++, anonymous namespaces are more +general: they can make entire classes private to a file. The problem with anonymous namespaces is that they naturally want to encourage indentation of their body, and they reduce locality of reference: if you see a @@ -1653,10 +1658,17 @@ Avoid putting declarations other than classes into anonymous namespaces: } // namespace -When you are looking at "``runHelper``" in the middle of a large C++ file, -you have no immediate way to tell if this function is local to the file. In -contrast, when the function is marked static, you don't need to cross-reference -faraway places in the file to tell that the function is local. +When you are looking at ``runHelper`` in the middle of a large C++ file, +you have no immediate way to tell if this function is local to the file. + +In contrast, when the function is marked static, you don't need to cross-reference +faraway places in the file to tell that the function is local: + +.. code-block:: c++ + + static void runHelper() { + ... 
+ } Don't Use Braces on Simple Single-Statement Bodies of if/else/loop Statements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h index d87457c..498c19b 100644 --- a/llvm/include/llvm/Analysis/IR2Vec.h +++ b/llvm/include/llvm/Analysis/IR2Vec.h @@ -153,6 +153,7 @@ class Vocabulary { static_cast<unsigned>(OperandKind::MaxOperandKind), "OperandKindNames array size must match MaxOperandKind"); +public: /// Vocabulary layout constants #define LAST_OTHER_INST(NUM) static constexpr unsigned MaxOpcodes = NUM; #include "llvm/IR/Instruction.def" @@ -162,7 +163,6 @@ class Vocabulary { static constexpr unsigned MaxOperandKinds = static_cast<unsigned>(OperandKind::MaxOperandKind); -public: Vocabulary() = default; Vocabulary(VocabVector &&Vocab); diff --git a/llvm/include/llvm/CodeGen/CommandFlags.h b/llvm/include/llvm/CodeGen/CommandFlags.h index aefdb53..d500e94 100644 --- a/llvm/include/llvm/CodeGen/CommandFlags.h +++ b/llvm/include/llvm/CodeGen/CommandFlags.h @@ -133,6 +133,8 @@ LLVM_ABI bool getEnableStackSizeSection(); LLVM_ABI bool getEnableAddrsig(); +LLVM_ABI bool getEnableCallGraphSection(); + LLVM_ABI bool getEmitCallSiteInfo(); LLVM_ABI bool getEnableMachineFunctionSplitter(); diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h index 119786f..0f3f945 100644 --- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h +++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h @@ -482,6 +482,8 @@ struct CallSiteInfo { MachineInstrLoc CallLocation; std::vector<ArgRegPair> ArgForwardingRegs; + /// Numeric callee type identifiers for the callgraph section. + std::vector<uint64_t> CalleeTypeIds; bool operator==(const CallSiteInfo &Other) const { return CallLocation.BlockNum == Other.CallLocation.BlockNum && @@ -511,6 +513,7 @@ template <> struct MappingTraits<CallSiteInfo> { YamlIO.mapRequired("offset", CSInfo.CallLocation.Offset); YamlIO.mapOptional("fwdArgRegs", CSInfo.ArgForwardingRegs, std::vector<CallSiteInfo::ArgRegPair>()); + YamlIO.mapOptional("calleeTypeIds", CSInfo.CalleeTypeIds); } static const bool flow = true; diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h index e5958ec..7f88323 100644 --- a/llvm/include/llvm/CodeGen/MachineFunction.h +++ b/llvm/include/llvm/CodeGen/MachineFunction.h @@ -515,6 +515,8 @@ public: struct CallSiteInfo { /// Vector of call argument and its forwarding register. SmallVector<ArgRegPair, 1> ArgRegPairs; + /// Callee type ids. + SmallVector<ConstantInt *, 4> CalleeTypeIds; }; struct CalledGlobalInfo { diff --git a/llvm/include/llvm/Demangle/DemangleConfig.h b/llvm/include/llvm/Demangle/DemangleConfig.h index 7ee23a4..912c9b8 100644 --- a/llvm/include/llvm/Demangle/DemangleConfig.h +++ b/llvm/include/llvm/Demangle/DemangleConfig.h @@ -15,6 +15,9 @@ #ifndef LLVM_DEMANGLE_DEMANGLECONFIG_H #define LLVM_DEMANGLE_DEMANGLECONFIG_H +// llvm-config.h is required for LLVM_ENABLE_LLVM_EXPORT_ANNOTATIONS +#include "llvm/Config/llvm-config.h" + #ifndef __has_feature #define __has_feature(x) 0 #endif diff --git a/llvm/include/llvm/IR/Metadata.h b/llvm/include/llvm/IR/Metadata.h index af252aa..33203ad 100644 --- a/llvm/include/llvm/IR/Metadata.h +++ b/llvm/include/llvm/IR/Metadata.h @@ -759,18 +759,18 @@ public: /// memory access used by the alias-analysis infrastructure. 
struct AAMDNodes { explicit AAMDNodes() = default; - explicit AAMDNodes(MDNode *T, MDNode *TS, MDNode *S, MDNode *N) - : TBAA(T), TBAAStruct(TS), Scope(S), NoAlias(N) {} + explicit AAMDNodes(MDNode *T, MDNode *TS, MDNode *S, MDNode *N, MDNode *NAS) + : TBAA(T), TBAAStruct(TS), Scope(S), NoAlias(N), NoAliasAddrSpace(NAS) {} bool operator==(const AAMDNodes &A) const { return TBAA == A.TBAA && TBAAStruct == A.TBAAStruct && Scope == A.Scope && - NoAlias == A.NoAlias; + NoAlias == A.NoAlias && NoAliasAddrSpace == A.NoAliasAddrSpace; } bool operator!=(const AAMDNodes &A) const { return !(*this == A); } explicit operator bool() const { - return TBAA || TBAAStruct || Scope || NoAlias; + return TBAA || TBAAStruct || Scope || NoAlias || NoAliasAddrSpace; } /// The tag for type-based alias analysis. @@ -785,6 +785,9 @@ struct AAMDNodes { /// The tag specifying the noalias scope. MDNode *NoAlias = nullptr; + /// The tag specifying the noalias address spaces. + MDNode *NoAliasAddrSpace = nullptr; + // Shift tbaa Metadata node to start off bytes later LLVM_ABI static MDNode *shiftTBAA(MDNode *M, size_t off); @@ -806,6 +809,8 @@ struct AAMDNodes { Result.TBAAStruct = Other.TBAAStruct == TBAAStruct ? TBAAStruct : nullptr; Result.Scope = Other.Scope == Scope ? Scope : nullptr; Result.NoAlias = Other.NoAlias == NoAlias ? NoAlias : nullptr; + Result.NoAliasAddrSpace = + Other.NoAliasAddrSpace == NoAliasAddrSpace ? NoAliasAddrSpace : nullptr; return Result; } @@ -818,6 +823,7 @@ struct AAMDNodes { TBAAStruct ? shiftTBAAStruct(TBAAStruct, Offset) : nullptr; Result.Scope = Scope; Result.NoAlias = NoAlias; + Result.NoAliasAddrSpace = NoAliasAddrSpace; return Result; } @@ -833,6 +839,7 @@ struct AAMDNodes { Result.TBAAStruct = TBAAStruct; Result.Scope = Scope; Result.NoAlias = NoAlias; + Result.NoAliasAddrSpace = NoAliasAddrSpace; return Result; } @@ -860,12 +867,12 @@ struct AAMDNodes { template<> struct DenseMapInfo<AAMDNodes> { static inline AAMDNodes getEmptyKey() { - return AAMDNodes(DenseMapInfo<MDNode *>::getEmptyKey(), - nullptr, nullptr, nullptr); + return AAMDNodes(DenseMapInfo<MDNode *>::getEmptyKey(), nullptr, nullptr, + nullptr, nullptr); } static inline AAMDNodes getTombstoneKey() { - return AAMDNodes(DenseMapInfo<MDNode *>::getTombstoneKey(), + return AAMDNodes(DenseMapInfo<MDNode *>::getTombstoneKey(), nullptr, nullptr, nullptr, nullptr); } @@ -873,7 +880,8 @@ struct DenseMapInfo<AAMDNodes> { return DenseMapInfo<MDNode *>::getHashValue(Val.TBAA) ^ DenseMapInfo<MDNode *>::getHashValue(Val.TBAAStruct) ^ DenseMapInfo<MDNode *>::getHashValue(Val.Scope) ^ - DenseMapInfo<MDNode *>::getHashValue(Val.NoAlias); + DenseMapInfo<MDNode *>::getHashValue(Val.NoAlias) ^ + DenseMapInfo<MDNode *>::getHashValue(Val.NoAliasAddrSpace); } static bool isEqual(const AAMDNodes &LHS, const AAMDNodes &RHS) { diff --git a/llvm/include/llvm/MC/MCObjectStreamer.h b/llvm/include/llvm/MC/MCObjectStreamer.h index bbbee15..4b43a8f 100644 --- a/llvm/include/llvm/MC/MCObjectStreamer.h +++ b/llvm/include/llvm/MC/MCObjectStreamer.h @@ -87,6 +87,7 @@ public: // Add a new fragment to the current section without a variable-size tail. 
void newFragment(); + void appendContents(ArrayRef<char> Contents); void appendContents(size_t Num, char Elt); void addFixup(const MCExpr *Value, MCFixupKind Kind); diff --git a/llvm/include/llvm/Support/Debug.h b/llvm/include/llvm/Support/Debug.h index 924d7b2..5542089 100644 --- a/llvm/include/llvm/Support/Debug.h +++ b/llvm/include/llvm/Support/Debug.h @@ -39,13 +39,19 @@ class raw_ostream; /// isCurrentDebugType - Return true if the specified string is the debug type /// specified on the command line, or if none was specified on the command line /// with the -debug-only=X option. -/// -bool isCurrentDebugType(const char *Type); +/// An optional level can be provided to control the verbosity of the output. +/// If the provided level is not 0 and user specified a level below the provided +/// level, return false. +bool isCurrentDebugType(const char *Type, int Level = 0); /// setCurrentDebugType - Set the current debug type, as if the -debug-only=X /// option were specified. Note that DebugFlag also needs to be set to true for /// debug output to be produced. -/// +/// The debug type format is "type[:level]", where the level is an optional +/// integer. If a level is provided, the debug output is enabled only if the +/// user specified a level at least as high as the provided level. +/// 0 is a special level that acts as an opt-out for this specific debug type +/// without affecting the other debug output. void setCurrentDebugType(const char *Type); /// setCurrentDebugTypes - Set the current debug type, as if the diff --git a/llvm/include/llvm/Support/DebugLog.h b/llvm/include/llvm/Support/DebugLog.h index 19d3098..8fca2d5 100644 --- a/llvm/include/llvm/Support/DebugLog.h +++ b/llvm/include/llvm/Support/DebugLog.h @@ -19,29 +19,64 @@ namespace llvm { #ifndef NDEBUG -// Output with given inputs and trailing newline. E.g., +// LDBG() is a macro that can be used as a raw_ostream for debugging. +// It will stream the output to the dbgs() stream, with a prefix of the +// debug type and the file and line number. A trailing newline is added to the +// output automatically. If the streamed content contains a newline, the prefix +// is added to each beginning of a new line. Nothing is printed if the debug +// output is not enabled or the debug type does not match. +// +// E.g., // LDBG() << "Bitset contains: " << Bitset; -// is equivalent to -// LLVM_DEBUG(dbgs() << DEBUG_TYPE << " [" << __FILE__ << ":" << __LINE__ -// << "] " << "Bitset contains: " << Bitset << "\n"); -#define LDBG() DEBUGLOG_WITH_STREAM_AND_TYPE(llvm::dbgs(), DEBUG_TYPE) - -#define DEBUGLOG_WITH_STREAM_TYPE_AND_FILE(STREAM, TYPE, FILE) \ - for (bool _c = (::llvm::DebugFlag && ::llvm::isCurrentDebugType(TYPE)); _c; \ - _c = false) \ +// is somehow equivalent to +// LLVM_DEBUG(dbgs() << "[" << DEBUG_TYPE << "] " << __FILE__ << ":" << +// __LINE__ << " " +// << "Bitset contains: " << Bitset << "\n"); +// +// An optional `level` argument can be provided to control the verbosity of the +// output. The default level is 1, and is in increasing level of verbosity. +// +// The `level` argument can be a literal integer, or a macro that evaluates to +// an integer. +// +#define LDBG(...) _GET_LDBG_MACRO(__VA_ARGS__)(__VA_ARGS__) + +// Helper macros to choose the correct macro based on the number of arguments. +#define LDBG_FUNC_CHOOSER(_f1, _f2, ...) _f2 +#define LDBG_FUNC_RECOMPOSER(argsWithParentheses) \ + LDBG_FUNC_CHOOSER argsWithParentheses +#define LDBG_CHOOSE_FROM_ARG_COUNT(...) 
\ + LDBG_FUNC_RECOMPOSER((__VA_ARGS__, LDBG_LOG_LEVEL, )) +#define LDBG_NO_ARG_EXPANDER() , LDBG_LOG_LEVEL_1 +#define _GET_LDBG_MACRO(...) \ + LDBG_CHOOSE_FROM_ARG_COUNT(LDBG_NO_ARG_EXPANDER __VA_ARGS__()) + +// Dispatch macros to support the `level` argument or none (default to 1) +#define LDBG_LOG_LEVEL(LEVEL) \ + DEBUGLOG_WITH_STREAM_AND_TYPE(llvm::dbgs(), LEVEL, DEBUG_TYPE) +#define LDBG_LOG_LEVEL_1() LDBG_LOG_LEVEL(1) + +#define DEBUGLOG_WITH_STREAM_TYPE_FILE_AND_LINE(STREAM, LEVEL, TYPE, FILE, \ + LINE) \ + for (bool _c = \ + (::llvm::DebugFlag && ::llvm::isCurrentDebugType(TYPE, LEVEL)); \ + _c; _c = false) \ ::llvm::impl::raw_ldbg_ostream{ \ - ::llvm::impl::computePrefix(TYPE, FILE, __LINE__), (STREAM)} \ + ::llvm::impl::computePrefix(TYPE, FILE, LINE, LEVEL), (STREAM)} \ .asLvalue() + +#define DEBUGLOG_WITH_STREAM_TYPE_AND_FILE(STREAM, LEVEL, TYPE, FILE) \ + DEBUGLOG_WITH_STREAM_TYPE_FILE_AND_LINE(STREAM, LEVEL, TYPE, FILE, __LINE__) // When __SHORT_FILE__ is not defined, the File is the full path, // otherwise __SHORT_FILE__ is defined in CMake to provide the file name // without the path prefix. #if defined(__SHORT_FILE__) -#define DEBUGLOG_WITH_STREAM_AND_TYPE(STREAM, TYPE) \ - DEBUGLOG_WITH_STREAM_TYPE_AND_FILE(STREAM, TYPE, __SHORT_FILE__) +#define DEBUGLOG_WITH_STREAM_AND_TYPE(STREAM, LEVEL, TYPE) \ + DEBUGLOG_WITH_STREAM_TYPE_AND_FILE(STREAM, LEVEL, TYPE, __SHORT_FILE__) #else -#define DEBUGLOG_WITH_STREAM_AND_TYPE(STREAM, TYPE) \ - DEBUGLOG_WITH_STREAM_TYPE_AND_FILE( \ - STREAM, TYPE, ::llvm::impl::LogWithNewline::getShortFileName(__FILE__)) +#define DEBUGLOG_WITH_STREAM_AND_TYPE(STREAM, LEVEL, TYPE) \ + DEBUGLOG_WITH_STREAM_TYPE_AND_FILE(STREAM, LEVEL, TYPE, \ + ::llvm::impl::getShortFileName(__FILE__)) #endif namespace impl { @@ -119,11 +154,11 @@ getShortFileName(const char *path) { /// "[DebugType] File:Line " /// Where the File is the file name without the path prefix. static LLVM_ATTRIBUTE_UNUSED std::string -computePrefix(const char *DebugType, const char *File, int Line) { +computePrefix(const char *DebugType, const char *File, int Line, int Level) { std::string Prefix; raw_string_ostream OsPrefix(Prefix); if (DebugType) - OsPrefix << "[" << DebugType << "] "; + OsPrefix << "[" << DebugType << ":" << Level << "] "; OsPrefix << File << ":" << Line << " "; return OsPrefix.str(); } @@ -131,7 +166,7 @@ computePrefix(const char *DebugType, const char *File, int Line) { #else // As others in Debug, When compiling without assertions, the -debug-* options // and all inputs too LDBG() are ignored. -#define LDBG() \ +#define LDBG(...) 
\ for (bool _c = false; _c; _c = false) \ ::llvm::nulls() #endif diff --git a/llvm/include/llvm/Target/TargetOptions.h b/llvm/include/llvm/Target/TargetOptions.h index f420798..db90f2e 100644 --- a/llvm/include/llvm/Target/TargetOptions.h +++ b/llvm/include/llvm/Target/TargetOptions.h @@ -133,10 +133,11 @@ public: EmitStackSizeSection(false), EnableMachineOutliner(false), EnableMachineFunctionSplitter(false), EnableStaticDataPartitioning(false), SupportsDefaultOutlining(false), - EmitAddrsig(false), BBAddrMap(false), EmitCallSiteInfo(false), - SupportsDebugEntryValues(false), EnableDebugEntryValues(false), - ValueTrackingVariableLocations(false), ForceDwarfFrameSection(false), - XRayFunctionIndex(true), DebugStrictDwarf(false), Hotpatch(false), + EmitAddrsig(false), BBAddrMap(false), EmitCallGraphSection(false), + EmitCallSiteInfo(false), SupportsDebugEntryValues(false), + EnableDebugEntryValues(false), ValueTrackingVariableLocations(false), + ForceDwarfFrameSection(false), XRayFunctionIndex(true), + DebugStrictDwarf(false), Hotpatch(false), PPCGenScalarMASSEntries(false), JMCInstrument(false), EnableCFIFixup(false), MisExpect(false), XCOFFReadOnlyPointers(false), VerifyArgABICompliance(true), @@ -319,6 +320,9 @@ public: /// to selectively generate basic block sections. std::shared_ptr<MemoryBuffer> BBSectionsFuncListBuf; + /// Emit section containing call graph metadata. + unsigned EmitCallGraphSection : 1; + /// The flag enables call site info production. It is used only for debug /// info, and it is restricted only to optimized code. This can be used for /// something else, so that should be controlled in the frontend. diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 8ec8697..a4ed62b 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -1147,6 +1147,9 @@ def fadd_contract : PatFrag<(ops node:$a, node:$b), (fadd node:$a, node:$b),[{ return N->getFlags().hasAllowContract(); }]>; +def fsub_contract : PatFrag<(ops node:$a, node:$b), (fsub node:$a, node:$b),[{ + return N->getFlags().hasAllowContract(); +}]>; def not : PatFrag<(ops node:$in), (xor node:$in, -1)>; def vnot : PatFrag<(ops node:$in), (xor node:$in, immAllOnesV)>; diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h index 670a632..ede9797 100644 --- a/llvm/include/llvm/TargetParser/Triple.h +++ b/llvm/include/llvm/TargetParser/Triple.h @@ -199,7 +199,8 @@ public: SUSE, OpenEmbedded, Intel, - LastVendorType = Intel + Meta, + LastVendorType = Meta }; enum OSType { UnknownOS, @@ -307,8 +308,8 @@ public: Mlibc, PAuthTest, - - LastEnvironmentType = PAuthTest + MTIA, + LastEnvironmentType = MTIA }; enum ObjectFormatType { UnknownObjectFormat, diff --git a/llvm/include/llvm/Transforms/HipStdPar/HipStdPar.h b/llvm/include/llvm/Transforms/HipStdPar/HipStdPar.h index 20850ba..a9a370b 100644 --- a/llvm/include/llvm/Transforms/HipStdPar/HipStdPar.h +++ b/llvm/include/llvm/Transforms/HipStdPar/HipStdPar.h @@ -41,6 +41,13 @@ public: static bool isRequired() { return true; } }; +class HipStdParMathFixupPass : public PassInfoMixin<HipStdParMathFixupPass> { +public: + PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); + + static bool isRequired() { return true; } +}; + } // namespace llvm #endif // LLVM_TRANSFORMS_HIPSTDPAR_HIPSTDPAR_H diff --git a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp index c871070..7025b83 100644 
--- a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp +++ b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp @@ -525,6 +525,8 @@ AAMDNodes AAMDNodes::merge(const AAMDNodes &Other) const { Result.TBAAStruct = nullptr; Result.Scope = MDNode::getMostGenericAliasScope(Scope, Other.Scope); Result.NoAlias = MDNode::intersect(NoAlias, Other.NoAlias); + Result.NoAliasAddrSpace = MDNode::getMostGenericNoaliasAddrspace( + NoAliasAddrSpace, Other.NoAliasAddrSpace); return Result; } @@ -533,6 +535,8 @@ AAMDNodes AAMDNodes::concat(const AAMDNodes &Other) const { Result.TBAA = Result.TBAAStruct = nullptr; Result.Scope = MDNode::getMostGenericAliasScope(Scope, Other.Scope); Result.NoAlias = MDNode::intersect(NoAlias, Other.NoAlias); + Result.NoAliasAddrSpace = MDNode::getMostGenericNoaliasAddrspace( + NoAliasAddrSpace, Other.NoAliasAddrSpace); return Result; } diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp index 3b3e7a4..a7c99b1 100644 --- a/llvm/lib/CodeGen/BranchFolding.cpp +++ b/llvm/lib/CodeGen/BranchFolding.cpp @@ -2083,22 +2083,54 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { if (TBB == FBB) { MBB->splice(Loc, TBB, TBB->begin(), TIB); } else { + // Merge the debug locations, and hoist and kill the debug instructions from + // both branches. FIXME: We could probably try harder to preserve some debug + // instructions (but at least this isn't producing wrong locations). + MachineInstrBuilder MIRBuilder(*MBB->getParent(), Loc); + auto HoistAndKillDbgInstr = [MBB, Loc](MachineBasicBlock::iterator DI) { + assert(DI->isDebugInstr() && "Expected a debug instruction"); + if (DI->isDebugRef()) { + const TargetInstrInfo *TII = + MBB->getParent()->getSubtarget().getInstrInfo(); + const MCInstrDesc &DBGV = TII->get(TargetOpcode::DBG_VALUE); + DI = BuildMI(*MBB->getParent(), DI->getDebugLoc(), DBGV, false, 0, + DI->getDebugVariable(), DI->getDebugExpression()); + MBB->insert(Loc, &*DI); + return; + } + // Deleting a DBG_PHI results in an undef at the referenced DBG_INSTR_REF. + if (DI->isDebugPHI()) { + DI->eraseFromParent(); + return; + } + + DI->setDebugValueUndef(); + DI->moveBefore(&*Loc); + }; + // TIB and FIB point to the end of the regions to hoist/merge in TBB and // FBB. MachineBasicBlock::iterator FE = FIB; MachineBasicBlock::iterator FI = FBB->begin(); for (MachineBasicBlock::iterator TI : make_early_inc_range(make_range(TBB->begin(), TIB))) { - // Move debug instructions and pseudo probes without modifying them. - // FIXME: This is the wrong thing to do for debug locations, which - // should at least be killed (and hoisted from BOTH blocks). - if (TI->isDebugOrPseudoInstr()) { - TI->moveBefore(&*Loc); + // Hoist and kill debug instructions from FBB. After this loop FI points + // to the next non-debug instruction to hoist (checked in assert after the + // TBB debug instruction handling code). + while (FI != FE && FI->isDebugInstr()) + HoistAndKillDbgInstr(FI++); + + // Kill debug instructions before moving. + if (TI->isDebugInstr()) { + HoistAndKillDbgInstr(TI); continue; } - // Get the next non-meta instruction in FBB. - FI = skipDebugInstructionsForward(FI, FE, false); + // FI and TI now point to identical non-debug instructions. + assert(FI != FE && "Unexpected end of FBB range"); + // Pseudo probes are excluded from the range when identifying foldable + // instructions, so we don't expect to see one now. 
+ assert(!TI->isPseudoProbe() && "Unexpected pseudo probe in range"); // NOTE: The loop above checks CheckKillDead but we can't do that here as // it modifies some kill markers after the check. assert(TI->isIdenticalTo(*FI, MachineInstr::CheckDefs) && @@ -2111,6 +2143,7 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { ++FI; } } + FBB->erase(FBB->begin(), FIB); if (UpdateLiveIns) diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp index 9512f79..810dc29 100644 --- a/llvm/lib/CodeGen/CommandFlags.cpp +++ b/llvm/lib/CodeGen/CommandFlags.cpp @@ -101,6 +101,7 @@ CGOPT(EABI, EABIVersion) CGOPT(DebuggerKind, DebuggerTuningOpt) CGOPT(bool, EnableStackSizeSection) CGOPT(bool, EnableAddrsig) +CGOPT(bool, EnableCallGraphSection) CGOPT(bool, EmitCallSiteInfo) CGOPT(bool, EnableMachineFunctionSplitter) CGOPT(bool, EnableStaticDataPartitioning) @@ -461,6 +462,11 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() { cl::init(false)); CGBINDOPT(EnableAddrsig); + static cl::opt<bool> EnableCallGraphSection( + "call-graph-section", cl::desc("Emit a call graph section"), + cl::init(false)); + CGBINDOPT(EnableCallGraphSection); + static cl::opt<bool> EmitCallSiteInfo( "emit-call-site-info", cl::desc( @@ -595,6 +601,7 @@ codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) { Options.EnableMachineFunctionSplitter = getEnableMachineFunctionSplitter(); Options.EnableStaticDataPartitioning = getEnableStaticDataPartitioning(); Options.EmitAddrsig = getEnableAddrsig(); + Options.EmitCallGraphSection = getEnableCallGraphSection(); Options.EmitCallSiteInfo = getEmitCallSiteInfo(); Options.EnableDebugEntryValues = getEnableDebugEntryValues(); Options.ForceDwarfFrameSection = getForceDwarfFrameSection(); diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index e8f513a..e84ba91 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -5949,8 +5949,7 @@ bool CombinerHelper::canCombineFMadOrFMA(MachineInstr &MI, const TargetOptions &Options = MF->getTarget().Options; LLT DstType = MRI.getType(MI.getOperand(0).getReg()); - if (CanReassociate && - !(Options.UnsafeFPMath || MI.getFlag(MachineInstr::MIFlag::FmReassoc))) + if (CanReassociate && !MI.getFlag(MachineInstr::MIFlag::FmReassoc)) return false; // Floating-point multiply-add with intermediate rounding. @@ -5962,8 +5961,7 @@ bool CombinerHelper::canCombineFMadOrFMA(MachineInstr &MI, if (!HasFMAD && !HasFMA) return false; - AllowFusionGlobally = Options.AllowFPOpFusion == FPOpFusion::Fast || - Options.UnsafeFPMath || HasFMAD; + AllowFusionGlobally = Options.AllowFPOpFusion == FPOpFusion::Fast || HasFMAD; // If the addition is not contractable, do not combine. if (!AllowFusionGlobally && !MI.getFlag(MachineInstr::MIFlag::FmContract)) return false; diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index ed7b07f..538a763 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -8004,7 +8004,7 @@ LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) { if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly. 
return UnableToLegalize; - if (MIRBuilder.getMF().getTarget().Options.UnsafeFPMath) { + if (MI.getFlag(MachineInstr::FmAfn)) { unsigned Flags = MI.getFlags(); auto Src32 = MIRBuilder.buildFPTrunc(S32, Src, Flags); MIRBuilder.buildFPTrunc(Dst, Src32, Flags); diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.cpp b/llvm/lib/CodeGen/MIRParser/MILexer.cpp index 7153902..193df1f 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.cpp +++ b/llvm/lib/CodeGen/MIRParser/MILexer.cpp @@ -616,6 +616,7 @@ static MIToken::TokenKind getMetadataKeywordKind(StringRef Identifier) { .Case("!range", MIToken::md_range) .Case("!DIExpression", MIToken::md_diexpr) .Case("!DILocation", MIToken::md_dilocation) + .Case("!noalias.addrspace", MIToken::md_noalias_addrspace) .Default(MIToken::Error); } diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.h b/llvm/lib/CodeGen/MIRParser/MILexer.h index d7cd067..54142ac 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.h +++ b/llvm/lib/CodeGen/MIRParser/MILexer.h @@ -151,6 +151,7 @@ struct MIToken { md_tbaa, md_alias_scope, md_noalias, + md_noalias_addrspace, md_range, md_diexpr, md_dilocation, diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index 3a364d5..807d59c 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -3482,6 +3482,11 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) { if (parseMDNode(AAInfo.NoAlias)) return true; break; + case MIToken::md_noalias_addrspace: + lex(); + if (parseMDNode(AAInfo.NoAliasAddrSpace)) + return true; + break; case MIToken::md_range: lex(); if (parseMDNode(Range)) @@ -3490,7 +3495,7 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) { // TODO: Report an error on duplicate metadata nodes. 
default: return error("expected 'align' or '!tbaa' or '!alias.scope' or " - "'!noalias' or '!range'"); + "'!noalias' or '!range' or '!noalias.addrspace'"); } } if (expectAndConsume(MIToken::rparen)) diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index 1e9fcf3..3e99e57 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -504,13 +504,21 @@ bool MIRParserImpl::initializeCallSiteInfo( return error(Error, ArgRegPair.Reg.SourceRange); CSInfo.ArgRegPairs.emplace_back(Reg, ArgRegPair.ArgNo); } + if (!YamlCSInfo.CalleeTypeIds.empty()) { + for (auto CalleeTypeId : YamlCSInfo.CalleeTypeIds) { + IntegerType *Int64Ty = Type::getInt64Ty(Context); + CSInfo.CalleeTypeIds.push_back(ConstantInt::get(Int64Ty, CalleeTypeId, + /*isSigned=*/false)); + } + } - if (TM.Options.EmitCallSiteInfo) + if (TM.Options.EmitCallSiteInfo || TM.Options.EmitCallGraphSection) MF.addCallSiteInfo(&*CallI, std::move(CSInfo)); } - if (YamlMF.CallSitesInfo.size() && !TM.Options.EmitCallSiteInfo) - return error(Twine("Call site info provided but not used")); + if (!YamlMF.CallSitesInfo.empty() && + !(TM.Options.EmitCallSiteInfo || TM.Options.EmitCallGraphSection)) + return error("call site info provided but not used"); return false; } diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index bc4e299..ad7835a 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -525,24 +525,30 @@ static void convertCallSiteObjects(yaml::MachineFunction &YMF, const MachineFunction &MF, ModuleSlotTracker &MST) { const auto *TRI = MF.getSubtarget().getRegisterInfo(); - for (auto CSInfo : MF.getCallSitesInfo()) { + for (auto [MI, CallSiteInfo] : MF.getCallSitesInfo()) { yaml::CallSiteInfo YmlCS; yaml::MachineInstrLoc CallLocation; // Prepare instruction position. - MachineBasicBlock::const_instr_iterator CallI = CSInfo.first->getIterator(); + MachineBasicBlock::const_instr_iterator CallI = MI->getIterator(); CallLocation.BlockNum = CallI->getParent()->getNumber(); // Get call instruction offset from the beginning of block. CallLocation.Offset = std::distance(CallI->getParent()->instr_begin(), CallI); YmlCS.CallLocation = CallLocation; + + auto [ArgRegPairs, CalleeTypeIds] = CallSiteInfo; // Construct call arguments and theirs forwarding register info. - for (auto ArgReg : CSInfo.second.ArgRegPairs) { + for (auto ArgReg : ArgRegPairs) { yaml::CallSiteInfo::ArgRegPair YmlArgReg; YmlArgReg.ArgNo = ArgReg.ArgNo; printRegMIR(ArgReg.Reg, YmlArgReg.Reg, TRI); YmlCS.ArgForwardingRegs.emplace_back(YmlArgReg); } + // Get type ids. + for (auto *CalleeTypeId : CalleeTypeIds) { + YmlCS.CalleeTypeIds.push_back(CalleeTypeId->getZExtValue()); + } YMF.CallSitesInfo.push_back(std::move(YmlCS)); } diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp index 429a17a..60d42e0 100644 --- a/llvm/lib/CodeGen/MachineFunction.cpp +++ b/llvm/lib/CodeGen/MachineFunction.cpp @@ -211,8 +211,7 @@ void MachineFunction::init() { ConstantPool = new (Allocator) MachineConstantPool(getDataLayout()); Alignment = STI->getTargetLowering()->getMinFunctionAlignment(); - // FIXME: Use Function::hasOptSize(). 
- if (!F.getAlign() && !F.hasFnAttribute(Attribute::OptimizeForSize)) + if (!F.getAlign() && !F.hasOptSize()) Alignment = std::max(Alignment, STI->getTargetLowering()->getPrefFunctionAlignment()); @@ -920,7 +919,7 @@ MachineFunction::getCallSiteInfo(const MachineInstr *MI) { assert(MI->isCandidateForAdditionalCallInfo() && "Call site info refers only to call (MI) candidates"); - if (!Target.Options.EmitCallSiteInfo) + if (!Target.Options.EmitCallSiteInfo && !Target.Options.EmitCallGraphSection) return CallSitesInfo.end(); return CallSitesInfo.find(MI); } diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp index 0d25169..c612f8de 100644 --- a/llvm/lib/CodeGen/MachineOperand.cpp +++ b/llvm/lib/CodeGen/MachineOperand.cpp @@ -1273,6 +1273,10 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << ", !noalias "; AAInfo.NoAlias->printAsOperand(OS, MST); } + if (AAInfo.NoAliasAddrSpace) { + OS << ", !noalias.addrspace "; + AAInfo.NoAliasAddrSpace->printAsOperand(OS, MST); + } if (getRanges()) { OS << ", !range "; getRanges()->printAsOperand(OS, MST); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 74172b2..ba0ab23 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -3853,7 +3853,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { break; case ISD::FP_TO_FP16: LLVM_DEBUG(dbgs() << "Legalizing FP_TO_FP16\n"); - if (!TLI.useSoftFloat() && TM.Options.UnsafeFPMath) { + if (Node->getFlags().hasApproximateFuncs() && !TLI.useSoftFloat()) { SDValue Op = Node->getOperand(0); MVT SVT = Op.getSimpleValueType(); if ((SVT == MVT::f64 || SVT == MVT::f80) && diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 1636465..6eca7b7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3923,11 +3923,15 @@ void SelectionDAGBuilder::visitFPTrunc(const User &I) { // FPTrunc is never a no-op cast, no need to check SDValue N = getValue(I.getOperand(0)); SDLoc dl = getCurSDLoc(); + SDNodeFlags Flags; + if (auto *TruncInst = dyn_cast<FPMathOperator>(&I)) + Flags.copyFMF(*TruncInst); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); setValue(&I, DAG.getNode(ISD::FP_ROUND, dl, DestVT, N, DAG.getTargetConstant( - 0, dl, TLI.getPointerTy(DAG.getDataLayout())))); + 0, dl, TLI.getPointerTy(DAG.getDataLayout())), + Flags)); } void SelectionDAGBuilder::visitFPExt(const User &I) { diff --git a/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp b/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp index 6267207..fd54190 100644 --- a/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp +++ b/llvm/lib/CodeGen/WindowsSecureHotPatching.cpp @@ -369,6 +369,19 @@ static GlobalVariable *getOrCreateRefVariable( AddrOfOldGV, Twine("__ref_").concat(GV->getName()), nullptr, GlobalVariable::NotThreadLocal); + // RefGV is created with isConstant = false, but we want to place RefGV into + // .rdata, not .data. It is important that the GlobalVariable be mutable + // from the compiler's point of view, so that the optimizer does not remove + // the global variable entirely and replace all references to it with its + // initial value. 
+ // + // When the Windows hot-patch loader applies a hot-patch, it maps the + // pages of .rdata as read/write so that it can set each __ref_* variable + // to point to the original variable in the base image. Afterward, pages in + // .rdata are remapped as read-only. This protects the __ref_* variables from + // being overwritten during execution. + RefGV->setSection(".rdata"); + // Create debug info for the replacement global variable. DataLayout Layout = M->getDataLayout(); DIType *DebugType = DebugInfo.createPointerType( diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp index 0dbd07f..1157cbe 100644 --- a/llvm/lib/IR/Metadata.cpp +++ b/llvm/lib/IR/Metadata.cpp @@ -1796,6 +1796,7 @@ AAMDNodes Instruction::getAAMetadata() const { Result.TBAAStruct = Info.lookup(LLVMContext::MD_tbaa_struct); Result.Scope = Info.lookup(LLVMContext::MD_alias_scope); Result.NoAlias = Info.lookup(LLVMContext::MD_noalias); + Result.NoAliasAddrSpace = Info.lookup(LLVMContext::MD_noalias_addrspace); } return Result; } @@ -1805,6 +1806,7 @@ void Instruction::setAAMetadata(const AAMDNodes &N) { setMetadata(LLVMContext::MD_tbaa_struct, N.TBAAStruct); setMetadata(LLVMContext::MD_alias_scope, N.Scope); setMetadata(LLVMContext::MD_noalias, N.NoAlias); + setMetadata(LLVMContext::MD_noalias_addrspace, N.NoAliasAddrSpace); } void Instruction::setNoSanitizeMetadata() { diff --git a/llvm/lib/MC/MCMachOStreamer.cpp b/llvm/lib/MC/MCMachOStreamer.cpp index 1074669..a214513 100644 --- a/llvm/lib/MC/MCMachOStreamer.cpp +++ b/llvm/lib/MC/MCMachOStreamer.cpp @@ -484,7 +484,8 @@ void MCMachOStreamer::finalizeCGProfile() { // For each entry, reserve space for 2 32-bit indices and a 64-bit count. size_t SectionBytes = W.getCGProfile().size() * (2 * sizeof(uint32_t) + sizeof(uint64_t)); - (*CGProfileSection->begin()).appendContents(SectionBytes, 0); + (*CGProfileSection->begin()) + .setVarContents(std::vector<char>(SectionBytes, 0)); } MCStreamer *llvm::createMachOStreamer(MCContext &Context, @@ -520,5 +521,6 @@ void MCMachOStreamer::createAddrSigSection() { // (instead of emitting a zero-sized section) so these relocations are // technically valid, even though we don't expect these relocations to // actually be applied by the linker. 
- Frag->appendContents(8, 0); + constexpr char zero[8] = {}; + Frag->setVarContents(zero); } diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index 9c7b05b..e277143 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -57,6 +57,10 @@ void MCObjectStreamer::insert(MCFragment *F) { newFragment(); } +void MCObjectStreamer::appendContents(ArrayRef<char> Contents) { + CurFrag->appendContents(Contents); +} + void MCObjectStreamer::appendContents(size_t Num, char Elt) { CurFrag->appendContents(Num, Elt); } @@ -538,8 +542,7 @@ void MCObjectStreamer::emitCVFileChecksumOffsetDirective(unsigned FileNo) { void MCObjectStreamer::emitBytes(StringRef Data) { MCDwarfLineEntry::make(this, getCurrentSectionOnly()); - MCFragment *DF = getCurrentFragment(); - DF->appendContents(ArrayRef(Data.data(), Data.size())); + appendContents(ArrayRef(Data.data(), Data.size())); } void MCObjectStreamer::emitValueToAlignment(Align Alignment, int64_t Fill, diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp index 7b5c3c0..e87696a 100644 --- a/llvm/lib/MC/MachObjectWriter.cpp +++ b/llvm/lib/MC/MachObjectWriter.cpp @@ -806,7 +806,7 @@ uint64_t MachObjectWriter::writeObject() { } MCSection *Sec = getContext().getMachOSection("__LLVM", "__cg_profile", 0, SectionKind::getMetadata()); - llvm::copy(OS.str(), Sec->curFragList()->Head->getContents().data()); + llvm::copy(OS.str(), Sec->curFragList()->Head->getVarContents().data()); } unsigned NumSections = Asm.end() - Asm.begin(); diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index fd89583..1b111dc 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -84,6 +84,7 @@ MODULE_PASS("global-merge-func", GlobalMergeFuncPass()) MODULE_PASS("globalopt", GlobalOptPass()) MODULE_PASS("globalsplit", GlobalSplitPass()) MODULE_PASS("hipstdpar-interpose-alloc", HipStdParAllocationInterpositionPass()) +MODULE_PASS("hipstdpar-math-fixup", HipStdParMathFixupPass()) MODULE_PASS("hipstdpar-select-accelerator-code", HipStdParAcceleratorCodeSelectionPass()) MODULE_PASS("hotcoldsplit", HotColdSplittingPass()) diff --git a/llvm/lib/Support/Debug.cpp b/llvm/lib/Support/Debug.cpp index 5bb04d0..b6f338f 100644 --- a/llvm/lib/Support/Debug.cpp +++ b/llvm/lib/Support/Debug.cpp @@ -24,11 +24,13 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/Debug.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Signals.h" #include "llvm/Support/circular_raw_ostream.h" #include "llvm/Support/raw_ostream.h" +#include <utility> #include "DebugOptions.h" @@ -38,27 +40,62 @@ using namespace llvm; +/// Parse a debug type string into a pair of the debug type and the debug level. +/// The expected format is "type[:level]", where the level is an optional +/// integer. 
+static std::pair<std::string, std::optional<int>> +parseDebugType(StringRef DbgType) { + std::optional<int> Level; + size_t ColonPos = DbgType.find(':'); + if (ColonPos != StringRef::npos) { + StringRef LevelStr = DbgType.substr(ColonPos + 1); + DbgType = DbgType.take_front(ColonPos); + if (LevelStr.empty()) + Level = 0; + else { + int parsedLevel; + if (to_integer(LevelStr, parsedLevel, 10)) + Level = parsedLevel; + } + } + return std::make_pair(DbgType.str(), Level); +} + // Even though LLVM might be built with NDEBUG, define symbols that the code // built without NDEBUG can depend on via the llvm/Support/Debug.h header. namespace llvm { /// Exported boolean set by the -debug option. bool DebugFlag = false; -static ManagedStatic<std::vector<std::string>> CurrentDebugType; +/// The current debug type and an optional debug level. +/// The debug level is the verbosity of the debug output. +/// 0 is a special level that acts as an opt-out for this specific debug type. +/// If provided, the debug output is enabled only if the user specified a level +/// at least as high as the provided level. +static ManagedStatic<std::vector<std::pair<std::string, std::optional<int>>>> + CurrentDebugType; /// Return true if the specified string is the debug type /// specified on the command line, or if none was specified on the command line /// with the -debug-only=X option. -bool isCurrentDebugType(const char *DebugType) { +bool isCurrentDebugType(const char *DebugType, int Level) { if (CurrentDebugType->empty()) return true; + // Track if there is at least one debug type with a level, this is used + // to allow to opt-out of some DebugType and leaving all the others enabled. + bool HasEnabledDebugType = false; // See if DebugType is in list. Note: do not use find() as that forces us to // unnecessarily create an std::string instance. - for (auto &d : *CurrentDebugType) { - if (d == DebugType) + for (auto &D : *CurrentDebugType) { + HasEnabledDebugType = + HasEnabledDebugType || (!D.second.has_value() || D.second.value() > 0); + if (D.first != DebugType) + continue; + if (!D.second.has_value()) return true; + return D.second >= Level; } - return false; + return !HasEnabledDebugType; } /// Set the current debug type, as if the -debug-only=X @@ -73,8 +110,11 @@ void setCurrentDebugType(const char *Type) { void setCurrentDebugTypes(const char **Types, unsigned Count) { CurrentDebugType->clear(); - llvm::append_range(*CurrentDebugType, ArrayRef(Types, Count)); + CurrentDebugType->reserve(Count); + for (const char *Type : ArrayRef(Types, Count)) + CurrentDebugType->push_back(parseDebugType(Type)); } + } // namespace llvm // All Debug.h functionality is a no-op in NDEBUG mode. 
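To make the new -debug-only level semantics concrete, here is a minimal usage sketch based on the LDBG() macros and the parsing code in this patch; the "my-pass" debug type and the messages are illustrative, not part of the change:

    #define DEBUG_TYPE "my-pass"
    #include "llvm/Support/DebugLog.h"

    void runMyPass() {
      LDBG() << "pass summary";          // default verbosity level 1
      LDBG(2) << "per-iteration detail"; // printed only at level >= 2
    }

With -debug-only=my-pass:1 only the first message prints; -debug-only=my-pass:2 (or plain -debug-only=my-pass, which imposes no level cap) prints both; and -debug-only=my-pass:0 opts this type out while leaving the other listed types enabled.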
@@ -114,10 +154,10 @@ struct DebugOnlyOpt { if (Val.empty()) return; DebugFlag = true; - SmallVector<StringRef,8> dbgTypes; - StringRef(Val).split(dbgTypes, ',', -1, false); - for (auto dbgType : dbgTypes) - CurrentDebugType->push_back(std::string(dbgType)); + SmallVector<StringRef, 8> DbgTypes; + StringRef(Val).split(DbgTypes, ',', -1, false); + for (auto DbgType : DbgTypes) + CurrentDebugType->push_back(parseDebugType(DbgType)); } }; } // namespace @@ -129,8 +169,13 @@ struct CreateDebugOnly { static void *call() { return new cl::opt<DebugOnlyOpt, true, cl::parser<std::string>>( "debug-only", - cl::desc("Enable a specific type of debug output (comma separated list " - "of types)"), + cl::desc( + "Enable a specific type of debug output (comma separated list " + "of types using the format \"type[:level]\", where the level " + "is an optional integer. The level can be set to 1, 2, 3, etc. to " + "control the verbosity of the output. Setting a debug-type level " + "to zero acts as an opt-out for this specific debug-type without " + "affecting the others."), cl::Hidden, cl::value_desc("debug string"), cl::location(DebugOnlyOptLoc), cl::ValueRequired); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 49d8b44..59cc1df 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -13,7 +13,6 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "Utils/AMDGPUBaseInfo.h" -#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" #include "llvm/Target/TargetMachine.h" diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index f25ce87..6118933 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4846,94 +4846,11 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, return SDValue(); } -// Detect when CMP and SELECT use the same constant and fold them to avoid -// loading the constant twice. Specifically handles patterns like: -// %cmp = icmp eq i32 %val, 4242 -// %sel = select i1 %cmp, i32 4242, i32 %other -// It can be optimized to reuse %val instead of 4242 in select. -static SDValue -foldCmpSelectWithSharedConstant(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, - const AMDGPUSubtarget *ST) { - SDValue Cond = N->getOperand(0); - SDValue TrueVal = N->getOperand(1); - SDValue FalseVal = N->getOperand(2); - - // Check if condition is a comparison. - if (Cond.getOpcode() != ISD::SETCC) - return SDValue(); - - SDValue LHS = Cond.getOperand(0); - SDValue RHS = Cond.getOperand(1); - ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); - - bool isFloatingPoint = LHS.getValueType().isFloatingPoint(); - bool isInteger = LHS.getValueType().isInteger(); - - // Handle simple floating-point and integer types only. - if (!isFloatingPoint && !isInteger) - return SDValue(); - - bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ); - bool isNonEquality = CC == (isFloatingPoint ? 
ISD::SETONE : ISD::SETNE); - if (!isEquality && !isNonEquality) - return SDValue(); - - SDValue ArgVal, ConstVal; - if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) || - (isInteger && isa<ConstantSDNode>(RHS))) { - ConstVal = RHS; - ArgVal = LHS; - } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) || - (isInteger && isa<ConstantSDNode>(LHS))) { - ConstVal = LHS; - ArgVal = RHS; - } else { - return SDValue(); - } - - // Check if constant should not be optimized - early return if not. - if (isFloatingPoint) { - const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF(); - const GCNSubtarget *GCNST = static_cast<const GCNSubtarget *>(ST); - - // Only optimize normal floating-point values (finite, non-zero, and - // non-subnormal as per IEEE 754), skip optimization for inlinable - // floating-point constants. - if (!Val.isNormal() || GCNST->getInstrInfo()->isInlineConstant(Val)) - return SDValue(); - } else { - int64_t IntVal = cast<ConstantSDNode>(ConstVal)->getSExtValue(); - - // Skip optimization for inlinable integer immediates. - // Inlinable immediates include: -16 to 64 (inclusive). - if (IntVal >= -16 && IntVal <= 64) - return SDValue(); - } - - // For equality and non-equality comparisons, patterns: - // select (setcc x, const), const, y -> select (setcc x, const), x, y - // select (setccinv x, const), y, const -> select (setccinv x, const), y, x - if (!(isEquality && TrueVal == ConstVal) && - !(isNonEquality && FalseVal == ConstVal)) - return SDValue(); - - SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal; - SDValue SelectRHS = - (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal; - return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond, - SelectLHS, SelectRHS); -} - SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0))) return Folded; - // Try to fold CMP + SELECT patterns with shared constants (both FP and - // integer). - if (SDValue Folded = foldCmpSelectWithSharedConstant(N, DCI, Subtarget)) - return Folded; - SDValue Cond = N->getOperand(0); if (Cond.getOpcode() != ISD::SETCC) return SDValue(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index c865082..38f9ee5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -836,8 +836,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { // When we are not using -fgpu-rdc, we can run accelerator code // selection relatively early, but still after linking to prevent // eager removal of potentially reachable symbols. - if (EnableHipStdPar) + if (EnableHipStdPar) { + PM.addPass(HipStdParMathFixupPass()); PM.addPass(HipStdParAcceleratorCodeSelectionPass()); + } PM.addPass(AMDGPUPrintfRuntimeBindingPass()); } @@ -916,8 +918,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { // selection after linking to prevent, otherwise we end up removing // potentially reachable symbols that were exported as external in other // modules. - if (EnableHipStdPar) + if (EnableHipStdPar) { + PM.addPass(HipStdParMathFixupPass()); PM.addPass(HipStdParAcceleratorCodeSelectionPass()); + } // We want to support the -lto-partitions=N option as "best effort". // For that, we need to lower LDS earlier in the pipeline before the // module is partitioned for codegen. 
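For reference, the combine removed above (and re-introduced later in this patch as SITargetLowering::performSelectCombine) is easiest to see at the source level. A hedged C++ sketch of the pattern, with hypothetical values:

    // Before the fold: the non-inlinable constant 4242 is needed twice,
    // once by the compare and once by the select.
    int sel = (val == 4242) ? 4242 : other;
    // After the fold: on the true edge, val is known to equal 4242, so
    // the select can reuse val and the constant is materialized once.
    int sel2 = (val == 4242) ? val : other;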
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 7207c25..0f172e0d 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -369,31 +369,68 @@ multiclass FLAT_Global_Store_Pseudo_t16<string opName> { } } -class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo< +// Async loads, introduced in gfx1250, will store directly +// to a DS address in vdst (they will not use M0 for DS addess). +class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsync = 0> : FLAT_Pseudo< opName, (outs ), !con( - !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), - (ins flat_offset:$offset, CPol_0:$cpol)), - " $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { - let LGKM_CNT = 1; + !if(IsAsync, (ins VGPR_32:$vdst), (ins)), + !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), + (ins flat_offset:$offset, CPol_0:$cpol)), + !if(IsAsync, " $vdst,", "")#" $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { + let LGKM_CNT = !not(IsAsync); + let VM_CNT = !not(IsAsync); + let ASYNC_CNT = IsAsync; let is_flat_global = 1; let lds = 1; let has_data = 0; + let has_vdst = IsAsync; // vdst for ds address with IsAsync + let mayLoad = 1; + let mayStore = 1; + let has_saddr = 1; + let enabled_saddr = EnableSaddr; + let VALU = 1; + let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", ""); + let Uses = !if(IsAsync, [EXEC, ASYNCcnt], [M0, EXEC]); + let Defs = !if(IsAsync, [ASYNCcnt], []); + let SchedRW = [WriteVMEM, WriteLDS]; +} + +multiclass FLAT_Global_Load_LDS_Pseudo<string opName, bit IsAsync = 0> { + def "" : FLAT_Global_Load_LDS_Pseudo<opName, 0, IsAsync>, + GlobalSaddrTable<0, opName>; + def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1, IsAsync>, + GlobalSaddrTable<1, opName>; +} + +class FLAT_Global_STORE_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo< + opName, + (outs ), + !con( + !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), (ins VGPR_32:$vdata), + (ins flat_offset:$offset, CPol_0:$cpol)), + " $vaddr, $vdata"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { + let VM_CNT = 0; + let ASYNC_CNT = 1; + let is_flat_global = 1; + let lds = 1; + let has_data = 1; // vdata for ds address let has_vdst = 0; let mayLoad = 1; let mayStore = 1; let has_saddr = 1; let enabled_saddr = EnableSaddr; let VALU = 1; - let Uses = [M0, EXEC]; + let Uses = [EXEC, ASYNCcnt]; + let Defs = [ASYNCcnt]; let SchedRW = [WriteVMEM, WriteLDS]; } -multiclass FLAT_Global_Load_LDS_Pseudo<string opName> { - def "" : FLAT_Global_Load_LDS_Pseudo<opName>, +multiclass FLAT_Global_STORE_LDS_Pseudo<string opName> { + def "" : FLAT_Global_STORE_LDS_Pseudo<opName>, GlobalSaddrTable<0, opName>; - def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1>, + def _SADDR : FLAT_Global_STORE_LDS_Pseudo<opName, 1>, GlobalSaddrTable<1, opName>; } @@ -1156,6 +1193,15 @@ let SubtargetPredicate = isGFX12Plus in { let SubtargetPredicate = isGFX1250Plus in { +defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b8", 1>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b32", 1>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b64", 1>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B128 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b128", 1>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B8 : 
FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b8">; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B32 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b32">; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B64 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b64">; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b128">; + def TENSOR_SAVE : FLAT_Global_Tensor_Pseudo<"tensor_save", 1>; def TENSOR_STOP : FLAT_Global_Tensor_Pseudo<"tensor_stop">; } // End SubtargetPredicate = isGFX1250Plus @@ -3374,6 +3420,15 @@ defm GLOBAL_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>; defm GLOBAL_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>; defm GLOBAL_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x5f>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x60>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x61>; +defm GLOBAL_LOAD_ASYNC_TO_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x62>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x63>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x64>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x65>; +defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x66>; + defm GLOBAL_LOAD_TR_B128_w32 : VFLAT_Real_AllAddr_gfx1250<0x057, "global_load_tr16_b128">; defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "global_load_tr8_b64">; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 8d51ec6..9017f4f 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -15896,6 +15896,78 @@ SDValue SITargetLowering::performClampCombine(SDNode *N, return SDValue(CSrc, 0); } +SDValue SITargetLowering::performSelectCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + + // Try to fold CMP + SELECT patterns with shared constants (both FP and + // integer). + // Detect when CMP and SELECT use the same constant and fold them to avoid + // loading the constant twice. Specifically handles patterns like: + // %cmp = icmp eq i32 %val, 4242 + // %sel = select i1 %cmp, i32 4242, i32 %other + // It can be optimized to reuse %val instead of 4242 in select. + SDValue Cond = N->getOperand(0); + SDValue TrueVal = N->getOperand(1); + SDValue FalseVal = N->getOperand(2); + + // Check if condition is a comparison. + if (Cond.getOpcode() != ISD::SETCC) + return SDValue(); + + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + + bool isFloatingPoint = LHS.getValueType().isFloatingPoint(); + bool isInteger = LHS.getValueType().isInteger(); + + // Handle simple floating-point and integer types only. + if (!isFloatingPoint && !isInteger) + return SDValue(); + + bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ); + bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE); + if (!isEquality && !isNonEquality) + return SDValue(); + + SDValue ArgVal, ConstVal; + if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) || + (isInteger && isa<ConstantSDNode>(RHS))) { + ConstVal = RHS; + ArgVal = LHS; + } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) || + (isInteger && isa<ConstantSDNode>(LHS))) { + ConstVal = LHS; + ArgVal = RHS; + } else { + return SDValue(); + } + + // Skip optimization for inlinable immediates. 
+ if (isFloatingPoint) { + const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF(); + if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val)) + return SDValue(); + } else { + if (AMDGPU::isInlinableIntLiteral( + cast<ConstantSDNode>(ConstVal)->getSExtValue())) + return SDValue(); + } + + // For equality and non-equality comparisons, patterns: + // select (setcc x, const), const, y -> select (setcc x, const), x, y + // select (setccinv x, const), y, const -> select (setccinv x, const), y, x + if (!(isEquality && TrueVal == ConstVal) && + !(isNonEquality && FalseVal == ConstVal)) + return SDValue(); + + SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal; + SDValue SelectRHS = + (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal; + return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond, + SelectLHS, SelectRHS); +} + SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { @@ -15944,6 +16016,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performFMulCombine(N, DCI); case ISD::SETCC: return performSetCCCombine(N, DCI); + case ISD::SELECT: + if (auto Res = performSelectCombine(N, DCI)) + return Res; + break; case ISD::FMAXNUM: case ISD::FMINNUM: case ISD::FMAXNUM_IEEE: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index acf6158..dedd9ae 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -211,6 +211,7 @@ private: SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFPRoundCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const; unsigned getFusedOpcode(const SelectionDAG &DAG, diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 607825e..f1262e11 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -321,8 +321,7 @@ public: bool IsNonTemporal, bool IsLastUse = false) const = 0; - virtual bool finalizeStore(MachineBasicBlock::iterator &MI, - bool Atomic) const { + virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const { return false; }; @@ -603,8 +602,7 @@ public: bool IsVolatile, bool IsNonTemporal, bool IsLastUse) const override; - bool finalizeStore(MachineBasicBlock::iterator &MI, - bool Atomic) const override; + bool finalizeStore(MachineInstr &MI, bool Atomic) const override; bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, @@ -2538,9 +2536,6 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( if (IsVolatile) { Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); - if (Op == SIMemOp::STORE) - Changed |= insertWaitsBeforeSystemScopeStore(MI); - // Ensure operation has completed at system scope to cause all volatile // operations to be visible outside the program in a global order. 
Do not // request cross address space as only the global address space can be @@ -2553,9 +2548,8 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( return Changed; } -bool SIGfx12CacheControl::finalizeStore(MachineBasicBlock::iterator &MI, - bool Atomic) const { - MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); +bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const { + MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol); if (!CPol) return false; @@ -2570,7 +2564,7 @@ bool SIGfx12CacheControl::finalizeStore(MachineBasicBlock::iterator &MI, // GFX12.5 only: Require SCOPE_SE on stores that may hit the scratch address // space. - if (TII->mayAccessScratchThroughFlat(*MI) && Scope == CPol::SCOPE_CU) + if (TII->mayAccessScratchThroughFlat(MI) && Scope == CPol::SCOPE_CU) return setScope(MI, CPol::SCOPE_SE); return false; @@ -2674,6 +2668,8 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, assert(!MI->mayLoad() && MI->mayStore()); bool Changed = false; + // FIXME: Necessary hack because iterator can lose track of the store. + MachineInstr &StoreMI = *MI; if (MOI.isAtomic()) { if (MOI.getOrdering() == AtomicOrdering::Monotonic || @@ -2690,7 +2686,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, MOI.getIsCrossAddressSpaceOrdering(), Position::BEFORE); - Changed |= CC->finalizeStore(MI, /*Atomic=*/true); + Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true); return Changed; } @@ -2703,7 +2699,7 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, // GFX12 specific, scope(desired coherence domain in cache hierarchy) is // instruction field, do not confuse it with atomic scope. - Changed |= CC->finalizeStore(MI, /*Atomic=*/false); + Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false); return Changed; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index b5b3cc9..83e63ac 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -732,7 +732,14 @@ bool isGenericAtomic(unsigned Opc) { } bool isAsyncStore(unsigned Opc) { - return false; // placeholder before async store implementation. + return Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B8_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B32_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B64_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B128_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B8_SADDR_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B32_SADDR_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B64_SADDR_gfx1250 || + Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B128_SADDR_gfx1250; } bool isTensorStore(unsigned Opc) { diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 50217c3..9e4dbec 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -4261,8 +4261,7 @@ std::optional<unsigned> ARMBaseInstrInfo::getOperandLatencyImpl( // instructions). if (Latency > 0 && Subtarget.isThumb2()) { const MachineFunction *MF = DefMI.getParent()->getParent(); - // FIXME: Use Function::hasOptSize(). 
- if (MF->getFunction().hasFnAttribute(Attribute::OptimizeForSize)) + if (MF->getFunction().hasOptSize()) --Latency; } return Latency; diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp index a87b9a2..bed6bc9 100644 --- a/llvm/lib/Target/BPF/BTFDebug.cpp +++ b/llvm/lib/Target/BPF/BTFDebug.cpp @@ -957,47 +957,47 @@ void BTFDebug::visitMapDefType(const DIType *Ty, uint32_t &TypeId) { return; } - // MapDef type may be a struct type or a non-pointer derived type - const DIType *OrigTy = Ty; - while (auto *DTy = dyn_cast<DIDerivedType>(Ty)) { - auto Tag = DTy->getTag(); - if (Tag != dwarf::DW_TAG_typedef && Tag != dwarf::DW_TAG_const_type && - Tag != dwarf::DW_TAG_volatile_type && - Tag != dwarf::DW_TAG_restrict_type) - break; - Ty = DTy->getBaseType(); - } - - const auto *CTy = dyn_cast<DICompositeType>(Ty); - if (!CTy) - return; - - auto Tag = CTy->getTag(); - if (Tag != dwarf::DW_TAG_structure_type || CTy->isForwardDecl()) - return; - - // Visit all struct members to ensure their types are visited. - const DINodeArray Elements = CTy->getElements(); - for (const auto *Element : Elements) { - const auto *MemberType = cast<DIDerivedType>(Element); - const DIType *MemberBaseType = MemberType->getBaseType(); - - // If the member is a composite type, that may indicate the currently - // visited composite type is a wrapper, and the member represents the - // actual map definition. - // In that case, visit the member with `visitMapDefType` instead of - // `visitTypeEntry`, treating it specifically as a map definition rather - // than as a regular composite type. - const auto *MemberCTy = dyn_cast<DICompositeType>(MemberBaseType); - if (MemberCTy) { - visitMapDefType(MemberBaseType, TypeId); - } else { - visitTypeEntry(MemberBaseType); + uint32_t TmpId; + switch (Ty->getTag()) { + case dwarf::DW_TAG_typedef: + case dwarf::DW_TAG_const_type: + case dwarf::DW_TAG_volatile_type: + case dwarf::DW_TAG_restrict_type: + case dwarf::DW_TAG_pointer_type: + visitMapDefType(dyn_cast<DIDerivedType>(Ty)->getBaseType(), TmpId); + break; + case dwarf::DW_TAG_array_type: + // Visit nested map array and jump to the element type + visitMapDefType(dyn_cast<DICompositeType>(Ty)->getBaseType(), TmpId); + break; + case dwarf::DW_TAG_structure_type: { + // Visit all struct members to ensure their types are visited. + const auto *CTy = cast<DICompositeType>(Ty); + const DINodeArray Elements = CTy->getElements(); + for (const auto *Element : Elements) { + const auto *MemberType = cast<DIDerivedType>(Element); + const DIType *MemberBaseType = MemberType->getBaseType(); + // If the member is a composite type, that may indicate the currently + // visited composite type is a wrapper, and the member represents the + // actual map definition. + // In that case, visit the member with `visitMapDefType` instead of + // `visitTypeEntry`, treating it specifically as a map definition rather + // than as a regular composite type. 
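+      // (For example, with a definition like
+      //   struct { struct my_map_def m; } wrapper;
+      // it is the member `m`, not `wrapper`, that carries the map fields.)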
+ const auto *MemberCTy = dyn_cast<DICompositeType>(MemberBaseType); + if (MemberCTy) { + visitMapDefType(MemberBaseType, TmpId); + } else { + visitTypeEntry(MemberBaseType); + } } + break; + } + default: + break; } // Visit this type, struct or a const/typedef/volatile/restrict type - visitTypeEntry(OrigTy, TypeId, false, false); + visitTypeEntry(Ty, TypeId, false, false); } /// Read file contents from the actual file or from the source diff --git a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp index c86fa2b..54c3cea 100644 --- a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp @@ -457,7 +457,7 @@ bool HexagonCopyToCombine::runOnMachineFunction(MachineFunction &MF) { TII = ST->getInstrInfo(); const Function &F = MF.getFunction(); - bool OptForSize = F.hasFnAttribute(Attribute::OptimizeForSize); + bool OptForSize = F.hasOptSize(); // Combine aggressively (for code size) ShouldCombineAggressively = diff --git a/llvm/lib/Target/Hexagon/HexagonMask.cpp b/llvm/lib/Target/Hexagon/HexagonMask.cpp index 6eccf80..9d7776d 100644 --- a/llvm/lib/Target/Hexagon/HexagonMask.cpp +++ b/llvm/lib/Target/Hexagon/HexagonMask.cpp @@ -76,7 +76,7 @@ bool HexagonMask::runOnMachineFunction(MachineFunction &MF) { HII = HST.getInstrInfo(); const Function &F = MF.getFunction(); - if (!F.hasFnAttribute(Attribute::OptimizeForSize)) + if (!F.hasOptSize()) return false; // Mask instruction is available only from v66 if (!HST.hasV66Ops()) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 65e7c56..96f52275 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -145,18 +145,6 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) { if (tryStoreVector(N)) return; break; - case NVPTXISD::LoadParam: - case NVPTXISD::LoadParamV2: - case NVPTXISD::LoadParamV4: - if (tryLoadParam(N)) - return; - break; - case NVPTXISD::StoreParam: - case NVPTXISD::StoreParamV2: - case NVPTXISD::StoreParamV4: - if (tryStoreParam(N)) - return; - break; case ISD::INTRINSIC_W_CHAIN: if (tryIntrinsicChain(N)) return; @@ -1462,267 +1450,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { return true; } -bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) { - SDValue Chain = Node->getOperand(0); - SDValue Offset = Node->getOperand(2); - SDValue Glue = Node->getOperand(3); - SDLoc DL(Node); - MemSDNode *Mem = cast<MemSDNode>(Node); - - unsigned VecSize; - switch (Node->getOpcode()) { - default: - return false; - case NVPTXISD::LoadParam: - VecSize = 1; - break; - case NVPTXISD::LoadParamV2: - VecSize = 2; - break; - case NVPTXISD::LoadParamV4: - VecSize = 4; - break; - } - - EVT EltVT = Node->getValueType(0); - EVT MemVT = Mem->getMemoryVT(); - - std::optional<unsigned> Opcode; - - switch (VecSize) { - default: - return false; - case 1: - Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, - NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16, - NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64); - break; - case 2: - Opcode = - pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8, - NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32, - NVPTX::LoadParamMemV2I64); - break; - case 4: - Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, - NVPTX::LoadParamMemV4I8, NVPTX::LoadParamMemV4I16, - NVPTX::LoadParamMemV4I32, {/* no v4i64 */}); - break; - } - if (!Opcode) - return false; - - SDVTList VTs; - if (VecSize == 1) { - VTs = 
CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue); - } else if (VecSize == 2) { - VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue); - } else { - EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue }; - VTs = CurDAG->getVTList(EVTs); - } - - unsigned OffsetVal = Offset->getAsZExtVal(); - - SmallVector<SDValue, 2> Ops( - {CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue}); - - ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops)); - return true; -} - -// Helpers for constructing opcode (ex: NVPTX::StoreParamV4F32_iiri) -#define getOpcV2H(ty, opKind0, opKind1) \ - NVPTX::StoreParamV2##ty##_##opKind0##opKind1 - -#define getOpcV2H1(ty, opKind0, isImm1) \ - (isImm1) ? getOpcV2H(ty, opKind0, i) : getOpcV2H(ty, opKind0, r) - -#define getOpcodeForVectorStParamV2(ty, isimm) \ - (isimm[0]) ? getOpcV2H1(ty, i, isimm[1]) : getOpcV2H1(ty, r, isimm[1]) - -#define getOpcV4H(ty, opKind0, opKind1, opKind2, opKind3) \ - NVPTX::StoreParamV4##ty##_##opKind0##opKind1##opKind2##opKind3 - -#define getOpcV4H3(ty, opKind0, opKind1, opKind2, isImm3) \ - (isImm3) ? getOpcV4H(ty, opKind0, opKind1, opKind2, i) \ - : getOpcV4H(ty, opKind0, opKind1, opKind2, r) - -#define getOpcV4H2(ty, opKind0, opKind1, isImm2, isImm3) \ - (isImm2) ? getOpcV4H3(ty, opKind0, opKind1, i, isImm3) \ - : getOpcV4H3(ty, opKind0, opKind1, r, isImm3) - -#define getOpcV4H1(ty, opKind0, isImm1, isImm2, isImm3) \ - (isImm1) ? getOpcV4H2(ty, opKind0, i, isImm2, isImm3) \ - : getOpcV4H2(ty, opKind0, r, isImm2, isImm3) - -#define getOpcodeForVectorStParamV4(ty, isimm) \ - (isimm[0]) ? getOpcV4H1(ty, i, isimm[1], isimm[2], isimm[3]) \ - : getOpcV4H1(ty, r, isimm[1], isimm[2], isimm[3]) - -#define getOpcodeForVectorStParam(n, ty, isimm) \ - (n == 2) ? getOpcodeForVectorStParamV2(ty, isimm) \ - : getOpcodeForVectorStParamV4(ty, isimm) - -static unsigned pickOpcodeForVectorStParam(SmallVector<SDValue, 8> &Ops, - unsigned NumElts, - MVT::SimpleValueType MemTy, - SelectionDAG *CurDAG, SDLoc DL) { - // Determine which inputs are registers and immediates make new operators - // with constant values - SmallVector<bool, 4> IsImm(NumElts, false); - for (unsigned i = 0; i < NumElts; i++) { - IsImm[i] = (isa<ConstantSDNode>(Ops[i]) || isa<ConstantFPSDNode>(Ops[i])); - if (IsImm[i]) { - SDValue Imm = Ops[i]; - if (MemTy == MVT::f32 || MemTy == MVT::f64) { - const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm); - const ConstantFP *CF = ConstImm->getConstantFPValue(); - Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0)); - } else { - const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm); - const ConstantInt *CI = ConstImm->getConstantIntValue(); - Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0)); - } - Ops[i] = Imm; - } - } - - // Get opcode for MemTy, size, and register/immediate operand ordering - switch (MemTy) { - case MVT::i8: - return getOpcodeForVectorStParam(NumElts, I8, IsImm); - case MVT::i16: - return getOpcodeForVectorStParam(NumElts, I16, IsImm); - case MVT::i32: - return getOpcodeForVectorStParam(NumElts, I32, IsImm); - case MVT::i64: - assert(NumElts == 2 && "MVT too large for NumElts > 2"); - return getOpcodeForVectorStParamV2(I64, IsImm); - case MVT::f32: - return getOpcodeForVectorStParam(NumElts, F32, IsImm); - case MVT::f64: - assert(NumElts == 2 && "MVT too large for NumElts > 2"); - return getOpcodeForVectorStParamV2(F64, IsImm); - - // These cases don't support immediates, just use the all register version - // and generate moves. 
- case MVT::i1: - return (NumElts == 2) ? NVPTX::StoreParamV2I8_rr - : NVPTX::StoreParamV4I8_rrrr; - case MVT::f16: - case MVT::bf16: - return (NumElts == 2) ? NVPTX::StoreParamV2I16_rr - : NVPTX::StoreParamV4I16_rrrr; - case MVT::v2f16: - case MVT::v2bf16: - case MVT::v2i16: - case MVT::v4i8: - return (NumElts == 2) ? NVPTX::StoreParamV2I32_rr - : NVPTX::StoreParamV4I32_rrrr; - default: - llvm_unreachable("Cannot select st.param for unknown MemTy"); - } -} - -bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) { - SDLoc DL(N); - SDValue Chain = N->getOperand(0); - SDValue Param = N->getOperand(1); - unsigned ParamVal = Param->getAsZExtVal(); - SDValue Offset = N->getOperand(2); - unsigned OffsetVal = Offset->getAsZExtVal(); - MemSDNode *Mem = cast<MemSDNode>(N); - SDValue Glue = N->getOperand(N->getNumOperands() - 1); - - // How many elements do we have? - unsigned NumElts; - switch (N->getOpcode()) { - default: - llvm_unreachable("Unexpected opcode"); - case NVPTXISD::StoreParam: - NumElts = 1; - break; - case NVPTXISD::StoreParamV2: - NumElts = 2; - break; - case NVPTXISD::StoreParamV4: - NumElts = 4; - break; - } - - // Build vector of operands - SmallVector<SDValue, 8> Ops; - for (unsigned i = 0; i < NumElts; ++i) - Ops.push_back(N->getOperand(i + 3)); - Ops.append({CurDAG->getTargetConstant(ParamVal, DL, MVT::i32), - CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32), Chain, Glue}); - - // Determine target opcode - // If we have an i1, use an 8-bit store. The lowering code in - // NVPTXISelLowering will have already emitted an upcast. - std::optional<unsigned> Opcode; - switch (NumElts) { - default: - llvm_unreachable("Unexpected NumElts"); - case 1: { - MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy; - SDValue Imm = Ops[0]; - if (MemTy != MVT::f16 && MemTy != MVT::bf16 && - (isa<ConstantSDNode>(Imm) || isa<ConstantFPSDNode>(Imm))) { - // Convert immediate to target constant - if (MemTy == MVT::f32 || MemTy == MVT::f64) { - const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm); - const ConstantFP *CF = ConstImm->getConstantFPValue(); - Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0)); - } else { - const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm); - const ConstantInt *CI = ConstImm->getConstantIntValue(); - Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0)); - } - Ops[0] = Imm; - // Use immediate version of store param - Opcode = - pickOpcodeForVT(MemTy, NVPTX::StoreParamI8_i, NVPTX::StoreParamI16_i, - NVPTX::StoreParamI32_i, NVPTX::StoreParamI64_i); - } else - Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, - NVPTX::StoreParamI8_r, NVPTX::StoreParamI16_r, - NVPTX::StoreParamI32_r, NVPTX::StoreParamI64_r); - if (Opcode == NVPTX::StoreParamI8_r) { - // Fine tune the opcode depending on the size of the operand. - // This helps to avoid creating redundant COPY instructions in - // InstrEmitter::AddRegisterOperand(). 
- switch (Ops[0].getSimpleValueType().SimpleTy) { - default: - break; - case MVT::i32: - Opcode = NVPTX::StoreParamI8TruncI32_r; - break; - case MVT::i64: - Opcode = NVPTX::StoreParamI8TruncI64_r; - break; - } - } - break; - } - case 2: - case 4: { - MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy; - Opcode = pickOpcodeForVectorStParam(Ops, NumElts, MemTy, CurDAG, DL); - break; - } - } - - SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue); - SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops); - MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand(); - CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef}); - - ReplaceNode(N, Ret); - return true; -} - /// SelectBFE - Look for instruction sequences that can be made more efficient /// by using the 'bfe' (bit-field extract) PTX instruction bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index b99b4ef..e504a8f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -78,8 +78,6 @@ private: bool tryLDG(MemSDNode *N); bool tryStore(SDNode *N); bool tryStoreVector(SDNode *N); - bool tryLoadParam(SDNode *N); - bool tryStoreParam(SDNode *N); bool tryFence(SDNode *N); void SelectAddrSpaceCast(SDNode *N); bool tryBFE(SDNode *N); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index ddcecc00..f79b862 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -1075,12 +1075,6 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(NVPTXISD::DeclareArrayParam) MAKE_CASE(NVPTXISD::DeclareScalarParam) MAKE_CASE(NVPTXISD::CALL) - MAKE_CASE(NVPTXISD::LoadParam) - MAKE_CASE(NVPTXISD::LoadParamV2) - MAKE_CASE(NVPTXISD::LoadParamV4) - MAKE_CASE(NVPTXISD::StoreParam) - MAKE_CASE(NVPTXISD::StoreParamV2) - MAKE_CASE(NVPTXISD::StoreParamV4) MAKE_CASE(NVPTXISD::MoveParam) MAKE_CASE(NVPTXISD::UNPACK_VECTOR) MAKE_CASE(NVPTXISD::BUILD_VECTOR) @@ -1318,105 +1312,6 @@ Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty, return DL.getABITypeAlign(Ty); } -static bool adjustElementType(EVT &ElementType) { - switch (ElementType.getSimpleVT().SimpleTy) { - default: - return false; - case MVT::f16: - case MVT::bf16: - ElementType = MVT::i16; - return true; - case MVT::f32: - case MVT::v2f16: - case MVT::v2bf16: - ElementType = MVT::i32; - return true; - case MVT::f64: - ElementType = MVT::i64; - return true; - } -} - -// Use byte-store when the param address of the argument value is unaligned. -// This may happen when the return value is a field of a packed structure. -// -// This is called in LowerCall() when passing the param values. 
-static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain, - uint64_t Offset, EVT ElementType, - SDValue StVal, SDValue &InGlue, - unsigned ArgID, const SDLoc &dl) { - // Bit logic only works on integer types - if (adjustElementType(ElementType)) - StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal); - - // Store each byte - SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue); - for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { - // Shift the byte to the last byte position - SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal, - DAG.getConstant(i * 8, dl, MVT::i32)); - SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32), - DAG.getConstant(Offset + i, dl, MVT::i32), - ShiftVal, InGlue}; - // Trunc store only the last byte by using - // st.param.b8 - // The register type can be larger than b8. - Chain = DAG.getMemIntrinsicNode( - NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8, - MachinePointerInfo(), Align(1), MachineMemOperand::MOStore); - InGlue = Chain.getValue(1); - } - return Chain; -} - -// Use byte-load when the param adress of the returned value is unaligned. -// This may happen when the returned value is a field of a packed structure. -static SDValue -LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset, - EVT ElementType, SDValue &InGlue, - SmallVectorImpl<SDValue> &TempProxyRegOps, - const SDLoc &dl) { - // Bit logic only works on integer types - EVT MergedType = ElementType; - adjustElementType(MergedType); - - // Load each byte and construct the whole value. Initial value to 0 - SDValue RetVal = DAG.getConstant(0, dl, MergedType); - // LoadParamMemI8 loads into i16 register only - SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue); - for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { - SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32), - DAG.getConstant(Offset + i, dl, MVT::i32), - InGlue}; - // This will be selected to LoadParamMemI8 - SDValue LdVal = - DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands, - MVT::i8, MachinePointerInfo(), Align(1)); - SDValue TmpLdVal = LdVal.getValue(0); - Chain = LdVal.getValue(1); - InGlue = LdVal.getValue(2); - - TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl, - TmpLdVal.getSimpleValueType(), TmpLdVal); - TempProxyRegOps.push_back(TmpLdVal); - - SDValue CMask = DAG.getConstant(255, dl, MergedType); - SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32); - // Need to extend the i16 register to the whole width. - TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal); - // Mask off the high bits. Leave only the lower 8bits. - // Do this because we are using loadparam.b8. 
- TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask); - // Shift and merge - TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift); - RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal); - } - if (ElementType != MergedType) - RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal); - - return RetVal; -} - static bool shouldConvertToIndirectCall(const CallBase *CB, const GlobalAddressSDNode *Func) { if (!Func) @@ -1483,10 +1378,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SelectionDAG &DAG = CLI.DAG; SDLoc dl = CLI.DL; - SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; - SDValue Chain = CLI.Chain; + const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; SDValue Callee = CLI.Callee; - bool &isTailCall = CLI.IsTailCall; ArgListTy &Args = CLI.getArgs(); Type *RetTy = CLI.RetTy; const CallBase *CB = CLI.CB; @@ -1496,6 +1389,36 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, return DAG.getConstant(I, dl, MVT::i32); }; + const unsigned UniqueCallSite = GlobalUniqueCallSite++; + const SDValue CallChain = CLI.Chain; + const SDValue StartChain = + DAG.getCALLSEQ_START(CallChain, UniqueCallSite, 0, dl); + SDValue DeclareGlue = StartChain.getValue(1); + + SmallVector<SDValue, 16> CallPrereqs{StartChain}; + + const auto MakeDeclareScalarParam = [&](SDValue Symbol, unsigned Size) { + // PTX ABI requires integral types to be at least 32 bits in size. FP16 is + // loaded/stored using i16, so it's handled here as well. + const unsigned SizeBits = promoteScalarArgumentSize(Size * 8); + SDValue Declare = + DAG.getNode(NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue}, + {StartChain, Symbol, GetI32(SizeBits), DeclareGlue}); + CallPrereqs.push_back(Declare); + DeclareGlue = Declare.getValue(1); + return Declare; + }; + + const auto MakeDeclareArrayParam = [&](SDValue Symbol, Align Align, + unsigned Size) { + SDValue Declare = DAG.getNode( + NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue}, + {StartChain, Symbol, GetI32(Align.value()), GetI32(Size), DeclareGlue}); + CallPrereqs.push_back(Declare); + DeclareGlue = Declare.getValue(1); + return Declare; + }; + // Variadic arguments. // // Normally, for each argument, we declare a param scalar or a param @@ -1511,15 +1434,17 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // // After all vararg is processed, 'VAOffset' holds the size of the // vararg byte array. + assert((CLI.IsVarArg || CLI.Args.size() == CLI.NumFixedArgs) && + "Non-VarArg function with extra arguments"); - SDValue VADeclareParam; // vararg byte array const unsigned FirstVAArg = CLI.NumFixedArgs; // position of first variadic - unsigned VAOffset = 0; // current offset in the param array + unsigned VAOffset = 0; // current offset in the param array - const unsigned UniqueCallSite = GlobalUniqueCallSite++; - SDValue TempChain = Chain; - Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl); - SDValue InGlue = Chain.getValue(1); + const SDValue VADeclareParam = + CLI.Args.size() > FirstVAArg + ? MakeDeclareArrayParam(getCallParamSymbol(DAG, FirstVAArg, MVT::i32), + Align(STI.getMaxRequiredAlignment()), 0) + : SDValue(); // Args.size() and Outs.size() need not match. 
// Outs.size() will be larger @@ -1580,43 +1505,19 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, assert((!IsByVal || TypeSize == ArgOuts[0].Flags.getByValSize()) && "type size mismatch"); - const std::optional<SDValue> ArgDeclare = [&]() -> std::optional<SDValue> { - if (IsVAArg) { - if (ArgI == FirstVAArg) { - VADeclareParam = DAG.getNode( - NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue}, - {Chain, ParamSymbol, GetI32(STI.getMaxRequiredAlignment()), - GetI32(0), InGlue}); - return VADeclareParam; - } - return std::nullopt; - } - if (IsByVal || shouldPassAsArray(Arg.Ty)) { - // declare .param .align <align> .b8 .param<n>[<size>]; - return DAG.getNode(NVPTXISD::DeclareArrayParam, dl, - {MVT::Other, MVT::Glue}, - {Chain, ParamSymbol, GetI32(ArgAlign.value()), - GetI32(TypeSize), InGlue}); - } + const SDValue ArgDeclare = [&]() { + if (IsVAArg) + return VADeclareParam; + + if (IsByVal || shouldPassAsArray(Arg.Ty)) + return MakeDeclareArrayParam(ParamSymbol, ArgAlign, TypeSize); + assert(ArgOuts.size() == 1 && "We must pass only one value as non-array"); - // declare .param .b<size> .param<n>; - - // PTX ABI requires integral types to be at least 32 bits in - // size. FP16 is loaded/stored using i16, so it's handled - // here as well. - const unsigned PromotedSize = - (ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) - ? promoteScalarArgumentSize(TypeSize * 8) - : TypeSize * 8; - - return DAG.getNode(NVPTXISD::DeclareScalarParam, dl, - {MVT::Other, MVT::Glue}, - {Chain, ParamSymbol, GetI32(PromotedSize), InGlue}); + assert((ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) && + "Only int and float types are supported as non-array arguments"); + + return MakeDeclareScalarParam(ParamSymbol, TypeSize); }(); - if (ArgDeclare) { - Chain = ArgDeclare->getValue(0); - InGlue = ArgDeclare->getValue(1); - } // PTX Interoperability Guide 3.3(A): [Integer] Values shorter // than 32-bits are sign extended or zero extended, depending on @@ -1626,36 +1527,25 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Arg.Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Arg.Ty) < 32; const auto GetStoredValue = [&](const unsigned I, EVT EltVT, - const Align PartAlign) { - SDValue StVal; + const MaybeAlign PartAlign) { if (IsByVal) { SDValue Ptr = ArgOutVals[0]; auto MPI = refinePtrAS(Ptr, DAG, DL, *this); SDValue SrcAddr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(Offsets[I])); - StVal = DAG.getLoad(EltVT, dl, TempChain, SrcAddr, MPI, PartAlign); - } else { - StVal = ArgOutVals[I]; - - auto PromotedVT = promoteScalarIntegerPTX(StVal.getValueType()); - if (PromotedVT != StVal.getValueType()) { - StVal = DAG.getNode(getExtOpcode(ArgOuts[I].Flags), dl, PromotedVT, - StVal); - } + return DAG.getLoad(EltVT, dl, CallChain, SrcAddr, MPI, PartAlign); } + SDValue StVal = ArgOutVals[I]; + assert(promoteScalarIntegerPTX(StVal.getValueType()) == + StVal.getValueType() && + "OutVal type should always be legal"); - if (ExtendIntegerParam) { - assert(VTs.size() == 1 && "Scalar can't have multiple parts."); - // zext/sext to i32 - StVal = - DAG.getNode(getExtOpcode(ArgOuts[I].Flags), dl, MVT::i32, StVal); - } else if (EltVT.getSizeInBits() < 16) { - // Use 16-bit registers for small stores as it's the - // smallest general purpose register size supported by NVPTX. 
- StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); - } - return StVal; + const EVT VTI = promoteScalarIntegerPTX(VTs[I]); + const EVT StoreVT = + ExtendIntegerParam ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI); + + return correctParamType(StVal, StoreVT, ArgOuts[I].Flags, DAG, dl); }; const auto VectorInfo = @@ -1664,23 +1554,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, unsigned J = 0; for (const unsigned NumElts : VectorInfo) { const int CurOffset = Offsets[J]; - EVT EltVT = promoteScalarIntegerPTX(VTs[J]); - const Align PartAlign = commonAlignment(ArgAlign, CurOffset); - - // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a - // scalar store. In such cases, fall back to byte stores. - if (NumElts == 1 && !IsVAArg && PartAlign < DAG.getEVTAlign(EltVT)) { - - SDValue StVal = GetStoredValue(J, EltVT, PartAlign); - Chain = LowerUnalignedStoreParam(DAG, Chain, - CurOffset + (IsByVal ? VAOffset : 0), - EltVT, StVal, InGlue, ArgI, dl); - - // LowerUnalignedStoreParam took care of inserting the necessary nodes - // into the SDAG, so just move on to the next element. - J++; - continue; - } + const EVT EltVT = promoteScalarIntegerPTX(VTs[J]); if (IsVAArg && !IsByVal) // Align each part of the variadic argument to their type. @@ -1688,44 +1562,45 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, assert((IsVAArg || VAOffset == 0) && "VAOffset must be 0 for non-VA args"); - SmallVector<SDValue, 6> StoreOperands{ - Chain, GetI32(IsVAArg ? FirstVAArg : ArgI), - GetI32(VAOffset + ((IsVAArg && !IsByVal) ? 0 : CurOffset))}; - // Record the values to store. - for (const unsigned K : llvm::seq(NumElts)) - StoreOperands.push_back(GetStoredValue(J + K, EltVT, PartAlign)); - StoreOperands.push_back(InGlue); + const unsigned Offset = + (VAOffset + ((IsVAArg && !IsByVal) ? 0 : CurOffset)); + SDValue Ptr = + DAG.getObjectPtrOffset(dl, ParamSymbol, TypeSize::getFixed(Offset)); - NVPTXISD::NodeType Op; - switch (NumElts) { - case 1: - Op = NVPTXISD::StoreParam; - break; - case 2: - Op = NVPTXISD::StoreParamV2; - break; - case 4: - Op = NVPTXISD::StoreParamV4; - break; - default: - llvm_unreachable("Invalid vector info."); + const MaybeAlign CurrentAlign = ExtendIntegerParam + ? MaybeAlign(std::nullopt) + : commonAlignment(ArgAlign, Offset); + + SDValue Val; + if (NumElts == 1) { + Val = GetStoredValue(J, EltVT, CurrentAlign); + } else { + SmallVector<SDValue, 8> StoreVals; + for (const unsigned K : llvm::seq(NumElts)) { + SDValue ValJ = GetStoredValue(J + K, EltVT, CurrentAlign); + if (ValJ.getValueType().isVector()) + DAG.ExtractVectorElements(ValJ, StoreVals); + else + StoreVals.push_back(ValJ); + } + + EVT VT = EVT::getVectorVT( + *DAG.getContext(), StoreVals[0].getValueType(), StoreVals.size()); + Val = DAG.getBuildVector(VT, dl, StoreVals); } - // Adjust type of the store op if we've extended the scalar - // return value. - EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT; - Chain = DAG.getMemIntrinsicNode( - Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, - TheStoreType, MachinePointerInfo(), PartAlign, - MachineMemOperand::MOStore); - InGlue = Chain.getValue(1); + SDValue StoreParam = + DAG.getStore(ArgDeclare, dl, Val, Ptr, + MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign); + CallPrereqs.push_back(StoreParam); // TODO: We may need to support vector types that can be passed // as scalars in variadic arguments. 
if (IsVAArg && !IsByVal) { assert(NumElts == 1 && "Vectorization is expected to be disabled for variadics."); + const EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT; VAOffset += DL.getTypeAllocSize(TheStoreType.getTypeForEVT(*DAG.getContext())); } @@ -1736,33 +1611,21 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, VAOffset += TypeSize; } - GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode()); - // Handle Result if (!Ins.empty()) { - const SDValue RetDeclare = [&]() { - const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32); - const unsigned ResultSize = DL.getTypeAllocSizeInBits(RetTy); - if (shouldPassAsArray(RetTy)) { - const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL); - return DAG.getNode(NVPTXISD::DeclareArrayParam, dl, - {MVT::Other, MVT::Glue}, - {Chain, RetSymbol, GetI32(RetAlign.value()), - GetI32(ResultSize / 8), InGlue}); - } - const auto PromotedResultSize = promoteScalarArgumentSize(ResultSize); - return DAG.getNode( - NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue}, - {Chain, RetSymbol, GetI32(PromotedResultSize), InGlue}); - }(); - Chain = RetDeclare.getValue(0); - InGlue = RetDeclare.getValue(1); + const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32); + const unsigned ResultSize = DL.getTypeAllocSize(RetTy); + if (shouldPassAsArray(RetTy)) { + const Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL); + MakeDeclareArrayParam(RetSymbol, RetAlign, ResultSize); + } else { + MakeDeclareScalarParam(RetSymbol, ResultSize); + } } - const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs); // Set the size of the vararg param byte array if the callee is a variadic // function and the variadic part is not empty. - if (HasVAArgs) { + if (VADeclareParam) { SDValue DeclareParamOps[] = {VADeclareParam.getOperand(0), VADeclareParam.getOperand(1), VADeclareParam.getOperand(2), GetI32(VAOffset), @@ -1771,6 +1634,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, VADeclareParam->getVTList(), DeclareParamOps); } + const auto *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode()); // If the type of the callsite does not match that of the function, convert // the callsite to an indirect call. const bool ConvertToIndirectCall = shouldConvertToIndirectCall(CB, Func); @@ -1800,15 +1664,16 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // instruction. // The prototype is embedded in a string and put as the operand for a // CallPrototype SDNode which will print out to the value of the string. + const bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs); std::string Proto = getPrototype(DL, RetTy, Args, CLI.Outs, HasVAArgs ? 
std::optional(FirstVAArg) : std::nullopt, *CB, UniqueCallSite); const char *ProtoStr = nvTM->getStrPool().save(Proto).data(); - Chain = DAG.getNode( - NVPTXISD::CallPrototype, dl, {MVT::Other, MVT::Glue}, - {Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InGlue}); - InGlue = Chain.getValue(1); + const SDValue PrototypeDeclare = DAG.getNode( + NVPTXISD::CallPrototype, dl, MVT::Other, + {StartChain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32)}); + CallPrereqs.push_back(PrototypeDeclare); } if (ConvertToIndirectCall) { @@ -1826,24 +1691,15 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const unsigned NumArgs = std::min<unsigned>(CLI.NumFixedArgs + 1, Args.size()); /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns, - /// NumParams, Callee, Proto, InGlue) - Chain = DAG.getNode(NVPTXISD::CALL, dl, {MVT::Other, MVT::Glue}, - {Chain, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall), - GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee, - GetI32(Proto), InGlue}); - InGlue = Chain.getValue(1); - + /// NumParams, Callee, Proto) + const SDValue CallToken = DAG.getTokenFactor(dl, CallPrereqs); + const SDValue Call = DAG.getNode( + NVPTXISD::CALL, dl, MVT::Other, + {CallToken, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall), + GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee, GetI32(Proto)}); + + SmallVector<SDValue, 16> LoadChains{Call}; SmallVector<SDValue, 16> ProxyRegOps; - // An item of the vector is filled if the element does not need a ProxyReg - // operation on it and should be added to InVals as is. ProxyRegOps and - // ProxyRegTruncates contain empty/none items at the same index. - SmallVector<SDValue, 16> RetElts; - // A temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()` - // to use the values of `LoadParam`s and to be replaced later then - // `CALLSEQ_END` is added. - SmallVector<SDValue, 16> TempProxyRegOps; - - // Generate loads from param memory/moves from registers for result if (!Ins.empty()) { SmallVector<EVT, 16> VTs; SmallVector<uint64_t, 16> Offsets; @@ -1860,104 +1716,65 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign); unsigned I = 0; - for (const unsigned VectorizedSize : VectorInfo) { - EVT TheLoadType = promoteScalarIntegerPTX(VTs[I]); - EVT EltType = Ins[I].VT; - const Align EltAlign = commonAlignment(RetAlign, Offsets[I]); - - if (TheLoadType != VTs[I]) - EltType = TheLoadType; - - if (ExtendIntegerRetVal) { - TheLoadType = MVT::i32; - EltType = MVT::i32; - } else if (TheLoadType.getSizeInBits() < 16) { - EltType = MVT::i16; - } + for (const unsigned NumElts : VectorInfo) { + const MaybeAlign CurrentAlign = + ExtendIntegerRetVal ? MaybeAlign(std::nullopt) + : commonAlignment(RetAlign, Offsets[I]); - // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a - // scalar load. In such cases, fall back to byte loads. - if (VectorizedSize == 1 && RetTy->isAggregateType() && - EltAlign < DAG.getEVTAlign(TheLoadType)) { - SDValue Ret = LowerUnalignedLoadRetParam( - DAG, Chain, Offsets[I], TheLoadType, InGlue, TempProxyRegOps, dl); - ProxyRegOps.push_back(SDValue()); - RetElts.resize(I); - RetElts.push_back(Ret); - - I++; - continue; - } + const EVT VTI = promoteScalarIntegerPTX(VTs[I]); + const EVT LoadVT = + ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? 
MVT::i8 : VTI); - SmallVector<EVT, 6> LoadVTs(VectorizedSize, EltType); - LoadVTs.append({MVT::Other, MVT::Glue}); + const unsigned PackingAmt = + LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1; - NVPTXISD::NodeType Op; - switch (VectorizedSize) { - case 1: - Op = NVPTXISD::LoadParam; - break; - case 2: - Op = NVPTXISD::LoadParamV2; - break; - case 4: - Op = NVPTXISD::LoadParamV4; - break; - default: - llvm_unreachable("Invalid vector info."); - } + const EVT VecVT = NumElts == 1 ? LoadVT + : EVT::getVectorVT(*DAG.getContext(), + LoadVT.getScalarType(), + NumElts * PackingAmt); - SDValue LoadOperands[] = {Chain, GetI32(1), GetI32(Offsets[I]), InGlue}; - SDValue RetVal = DAG.getMemIntrinsicNode( - Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType, - MachinePointerInfo(), EltAlign, MachineMemOperand::MOLoad); + const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32); + SDValue Ptr = + DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I])); - for (const unsigned J : llvm::seq(VectorizedSize)) { - ProxyRegOps.push_back(RetVal.getValue(J)); - } + SDValue R = + DAG.getLoad(VecVT, dl, Call, Ptr, + MachinePointerInfo(ADDRESS_SPACE_PARAM), CurrentAlign); - Chain = RetVal.getValue(VectorizedSize); - InGlue = RetVal.getValue(VectorizedSize + 1); + LoadChains.push_back(R.getValue(1)); - I += VectorizedSize; + if (NumElts == 1) + ProxyRegOps.push_back(R); + else + for (const unsigned J : llvm::seq(NumElts)) { + SDValue Elt = DAG.getNode( + LoadVT.isVector() ? ISD::EXTRACT_SUBVECTOR + : ISD::EXTRACT_VECTOR_ELT, + dl, LoadVT, R, DAG.getVectorIdxConstant(J * PackingAmt, dl)); + ProxyRegOps.push_back(Elt); + } + I += NumElts; } } - Chain = - DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl); - InGlue = Chain.getValue(1); + const SDValue EndToken = DAG.getTokenFactor(dl, LoadChains); + const SDValue CallEnd = DAG.getCALLSEQ_END(EndToken, UniqueCallSite, + UniqueCallSite + 1, SDValue(), dl); // Append ProxyReg instructions to the chain to make sure that `callseq_end` // will not get lost. Otherwise, during libcalls expansion, the nodes can become // dangling. 
- for (const unsigned I : llvm::seq(ProxyRegOps.size())) { - if (I < RetElts.size() && RetElts[I]) { - InVals.push_back(RetElts[I]); - continue; - } - - SDValue Ret = - DAG.getNode(NVPTXISD::ProxyReg, dl, ProxyRegOps[I].getSimpleValueType(), - {Chain, ProxyRegOps[I]}); - - const EVT ExpectedVT = Ins[I].VT; - if (!Ret.getValueType().bitsEq(ExpectedVT)) { - Ret = DAG.getNode(ISD::TRUNCATE, dl, ExpectedVT, Ret); - } + for (const auto [I, Reg] : llvm::enumerate(ProxyRegOps)) { + SDValue Proxy = + DAG.getNode(NVPTXISD::ProxyReg, dl, Reg.getValueType(), {CallEnd, Reg}); + SDValue Ret = correctParamType(Proxy, Ins[I].VT, Ins[I].Flags, DAG, dl); InVals.push_back(Ret); } - for (SDValue &T : TempProxyRegOps) { - SDValue Repl = DAG.getNode(NVPTXISD::ProxyReg, dl, T.getSimpleValueType(), - {Chain, T.getOperand(0)}); - DAG.ReplaceAllUsesWith(T, Repl); - DAG.RemoveDeadNode(T.getNode()); - } - - // set isTailCall to false for now, until we figure out how to express + // set IsTailCall to false for now, until we figure out how to express // tail call optimization in PTX - isTailCall = false; - return Chain; + CLI.IsTailCall = false; + return CallEnd; } SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, @@ -5117,10 +4934,6 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { Operands.push_back(DCI.DAG.getIntPtrConstant( cast<LoadSDNode>(LD)->getExtensionType(), DL)); break; - case NVPTXISD::LoadParamV2: - OldNumOutputs = 2; - Opcode = NVPTXISD::LoadParamV4; - break; case NVPTXISD::LoadV2: OldNumOutputs = 2; Opcode = NVPTXISD::LoadV4; @@ -5201,12 +5014,6 @@ static SDValue combinePackingMovIntoStore(SDNode *N, MemVT = ST->getMemoryVT(); Opcode = NVPTXISD::StoreV2; break; - case NVPTXISD::StoreParam: - Opcode = NVPTXISD::StoreParamV2; - break; - case NVPTXISD::StoreParamV2: - Opcode = NVPTXISD::StoreParamV4; - break; case NVPTXISD::StoreV2: MemVT = ST->getMemoryVT(); Opcode = NVPTXISD::StoreV4; @@ -5218,7 +5025,6 @@ static SDValue combinePackingMovIntoStore(SDNode *N, return SDValue(); Opcode = NVPTXISD::StoreV8; break; - case NVPTXISD::StoreParamV4: case NVPTXISD::StoreV8: // PTX doesn't support the next doubling of operands return SDValue(); @@ -5263,30 +5069,11 @@ static SDValue combinePackingMovIntoStore(SDNode *N, MemVT, ST->getMemOperand()); } -static SDValue PerformStoreCombineHelper(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - unsigned Front, unsigned Back) { - if (all_of(N->ops().drop_front(Front).drop_back(Back), - [](const SDUse &U) { return U.get()->isUndef(); })) - // Operand 0 is the previous value in the chain. Cannot return EntryToken - // as the previous value will become unused and eliminated later. - return N->getOperand(0); - - return combinePackingMovIntoStore(N, DCI, Front, Back); -} - static SDValue PerformStoreCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { return combinePackingMovIntoStore(N, DCI, 1, 2); } -static SDValue PerformStoreParamCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - // Operands from the 3rd to the 2nd last one are the values to be stored. - // {Chain, ArgID, Offset, Val, Glue} - return PerformStoreCombineHelper(N, DCI, 3, 1); -} - /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. 
///
static SDValue PerformADDCombine(SDNode *N,
@@ -5942,6 +5729,86 @@ static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                           N->getConstantOperandAPInt(2),
                           N->getConstantOperandVal(3)),
               SDLoc(N), N->getValueType(0));
+  return SDValue();
+}
+
+// During call lowering we wrap the return values in a ProxyReg node which
+// depends on the chain value produced by the completed call. This ensures
+// that the full call is emitted in cases where libcalls are used to legalize
+// operations. To improve the functioning of other DAG combines we pull all
+// operations we can through one of these nodes, ensuring that the ProxyReg
+// directly wraps a load. That is:
+//
+// (ProxyReg (zext (load retval0))) => (zext (ProxyReg (load retval0)))
+//
+static SDValue sinkProxyReg(SDValue R, SDValue Chain,
+                            TargetLowering::DAGCombinerInfo &DCI) {
+  switch (R.getOpcode()) {
+  case ISD::TRUNCATE:
+  case ISD::ANY_EXTEND:
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::BITCAST: {
+    if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
+      return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), V);
+    return SDValue();
+  }
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::SRA:
+  case ISD::OR: {
+    if (SDValue A = sinkProxyReg(R.getOperand(0), Chain, DCI))
+      if (SDValue B = sinkProxyReg(R.getOperand(1), Chain, DCI))
+        return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), A, B);
+    return SDValue();
+  }
+  case ISD::Constant:
+    return R;
+  case ISD::LOAD:
+  case NVPTXISD::LoadV2:
+  case NVPTXISD::LoadV4: {
+    return DCI.DAG.getNode(NVPTXISD::ProxyReg, SDLoc(R), R.getValueType(),
+                           {Chain, R});
+  }
+  case ISD::BUILD_VECTOR: {
+    if (DCI.isBeforeLegalize())
+      return SDValue();
+
+    SmallVector<SDValue, 16> Ops;
+    for (auto &Op : R->ops()) {
+      SDValue V = sinkProxyReg(Op, Chain, DCI);
+      if (!V)
+        return SDValue();
+      Ops.push_back(V);
+    }
+    return DCI.DAG.getNode(ISD::BUILD_VECTOR, SDLoc(R), R.getValueType(), Ops);
+  }
+  case ISD::EXTRACT_VECTOR_ELT: {
+    if (DCI.isBeforeLegalize())
+      return SDValue();
+
+    if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
+      return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(R),
+                             R.getValueType(), V, R.getOperand(1));
+    return SDValue();
+  }
+  default:
+    return SDValue();
+  }
+}
+
+static SDValue combineProxyReg(SDNode *N,
+                               TargetLowering::DAGCombinerInfo &DCI) {
+
+  SDValue Chain = N->getOperand(0);
+  SDValue Reg = N->getOperand(1);
+
+  // If the ProxyReg is not wrapping a load, try to pull the operations through
+  // the ProxyReg.
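+  // The recursion in sinkProxyReg grounds out at loads (wrapped in a fresh
+  // ProxyReg), constants (returned as-is, since they need no proxy), or an
+  // opcode it cannot look through, which abandons the attempt entirely.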
+ if (Reg.getOpcode() != ISD::LOAD) { + if (SDValue V = sinkProxyReg(Reg, Chain, DCI)) + return V; + } return SDValue(); } @@ -5965,7 +5832,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, case ISD::FADD: return PerformFADDCombine(N, DCI, OptLevel); case ISD::LOAD: - case NVPTXISD::LoadParamV2: case NVPTXISD::LoadV2: case NVPTXISD::LoadV4: return combineUnpackingMovIntoLoad(N, DCI); @@ -5973,6 +5839,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, return PerformMULCombine(N, DCI, OptLevel); case NVPTXISD::PRMT: return combinePRMT(N, DCI, OptLevel); + case NVPTXISD::ProxyReg: + return combineProxyReg(N, DCI); case ISD::SETCC: return PerformSETCCCombine(N, DCI, STI.getSmVersion()); case ISD::SHL: @@ -5980,10 +5848,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SREM: case ISD::UREM: return PerformREMCombine(N, DCI, OptLevel); - case NVPTXISD::StoreParam: - case NVPTXISD::StoreParamV2: - case NVPTXISD::StoreParamV4: - return PerformStoreParamCombine(N, DCI); case ISD::STORE: case NVPTXISD::StoreV2: case NVPTXISD::StoreV4: @@ -6332,6 +6196,22 @@ static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG, Results.push_back(NewValue.getValue(3)); } +static void replaceProxyReg(SDNode *N, SelectionDAG &DAG, + const TargetLowering &TLI, + SmallVectorImpl<SDValue> &Results) { + SDValue Chain = N->getOperand(0); + SDValue Reg = N->getOperand(1); + + MVT VT = TLI.getRegisterType(*DAG.getContext(), Reg.getValueType()); + + SDValue NewReg = DAG.getAnyExtOrTrunc(Reg, SDLoc(N), VT); + SDValue NewProxy = + DAG.getNode(NVPTXISD::ProxyReg, SDLoc(N), VT, {Chain, NewReg}); + SDValue Res = DAG.getAnyExtOrTrunc(NewProxy, SDLoc(N), N->getValueType(0)); + + Results.push_back(Res); +} + void NVPTXTargetLowering::ReplaceNodeResults( SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { switch (N->getOpcode()) { @@ -6349,6 +6229,9 @@ void NVPTXTargetLowering::ReplaceNodeResults( case ISD::CopyFromReg: ReplaceCopyFromReg_128(N, DAG, Results); return; + case NVPTXISD::ProxyReg: + replaceProxyReg(N, DAG, *this, Results); + return; } } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 228e2aa..cf72a1e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -38,7 +38,7 @@ enum NodeType : unsigned { /// This node represents a PTX call instruction. 
It's operands are as follows: /// /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns, - /// NumParams, Callee, Proto, InGlue) + /// NumParams, Callee, Proto) CALL, MoveParam, @@ -84,13 +84,7 @@ enum NodeType : unsigned { StoreV2, StoreV4, StoreV8, - LoadParam, - LoadParamV2, - LoadParamV4, - StoreParam, - StoreParamV2, - StoreParamV4, - LAST_MEMORY_OPCODE = StoreParamV4, + LAST_MEMORY_OPCODE = StoreV8, }; } diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 442b900..86d6f7c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -1757,12 +1757,6 @@ def SDTDeclareArrayParam : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>; def SDTDeclareScalarParam : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; -def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>; -def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>; -def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>; -def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>; -def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>; -def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>; def SDTMoveParamProfile : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisSameAs<0, 1>]>; def SDTProxyReg : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>]>; @@ -1774,104 +1768,20 @@ def declare_array_param : def declare_scalar_param : SDNode<"NVPTXISD::DeclareScalarParam", SDTDeclareScalarParam, [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; - -def LoadParam : - SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile, - [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; -def LoadParamV2 : - SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile, - [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; -def LoadParamV4 : - SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile, - [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>; -def StoreParam : - SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def StoreParamV2 : - SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def StoreParamV4 : - SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; def MoveParam : SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile, []>; def proxy_reg : SDNode<"NVPTXISD::ProxyReg", SDTProxyReg, [SDNPHasChain]>; /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns, - /// NumParams, Callee, Proto, InGlue) + /// NumParams, Callee, Proto) def SDTCallProfile : SDTypeProfile<0, 6, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<5, i32>]>; -def call : - SDNode<"NVPTXISD::CALL", SDTCallProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; - -let mayLoad = true in { - class LoadParamMemInst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs regclass:$dst), (ins Offseti32imm:$b), - !strconcat("ld.param", opstr, " \t$dst, [retval0$b];"), - []>; - - class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> : - NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins Offseti32imm:$b), - !strconcat("ld.param.v2", opstr, - " \t{{$dst, $dst2}}, [retval0$b];"), []>; - - class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> : - 
NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3, - regclass:$dst4), - (ins Offseti32imm:$b), - !strconcat("ld.param.v4", opstr, - " \t{{$dst, $dst2, $dst3, $dst4}}, [retval0$b];"), - []>; -} - -let mayStore = true in { - - multiclass StoreParamInst<NVPTXRegClass regclass, Operand IMMType, string opstr, bit support_imm = true> { - foreach op = [IMMType, regclass] in - if !or(support_imm, !isa<NVPTXRegClass>(op)) then - def _ # !if(!isa<NVPTXRegClass>(op), "r", "i") - : NVPTXInst<(outs), - (ins op:$val, i32imm:$a, Offseti32imm:$b), - "st.param" # opstr # " \t[param$a$b], $val;", - []>; - } - - multiclass StoreParamV2Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> { - foreach op1 = [IMMType, regclass] in - foreach op2 = [IMMType, regclass] in - def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i") - # !if(!isa<NVPTXRegClass>(op2), "r", "i") - : NVPTXInst<(outs), - (ins op1:$val1, op2:$val2, - i32imm:$a, Offseti32imm:$b), - "st.param.v2" # opstr # " \t[param$a$b], {{$val1, $val2}};", - []>; - } - - multiclass StoreParamV4Inst<NVPTXRegClass regclass, Operand IMMType, string opstr> { - foreach op1 = [IMMType, regclass] in - foreach op2 = [IMMType, regclass] in - foreach op3 = [IMMType, regclass] in - foreach op4 = [IMMType, regclass] in - def _ # !if(!isa<NVPTXRegClass>(op1), "r", "i") - # !if(!isa<NVPTXRegClass>(op2), "r", "i") - # !if(!isa<NVPTXRegClass>(op3), "r", "i") - # !if(!isa<NVPTXRegClass>(op4), "r", "i") - - : NVPTXInst<(outs), - (ins op1:$val1, op2:$val2, op3:$val3, op4:$val4, - i32imm:$a, Offseti32imm:$b), - "st.param.v4" # opstr # - " \t[param$a$b], {{$val1, $val2, $val3, $val4}};", - []>; - } -} +def call : SDNode<"NVPTXISD::CALL", SDTCallProfile, [SDNPHasChain, SDNPSideEffect]>; /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns, -/// NumParams, Callee, Proto, InGlue) +/// NumParams, Callee, Proto) def CallOperand : Operand<i32> { let PrintMethod = "printCallOperand"; } @@ -1908,43 +1818,6 @@ foreach is_convergent = [0, 1] in { (call_uni_inst $addr, imm:$rets, imm:$params)>; } -def LoadParamMemI64 : LoadParamMemInst<B64, ".b64">; -def LoadParamMemI32 : LoadParamMemInst<B32, ".b32">; -def LoadParamMemI16 : LoadParamMemInst<B16, ".b16">; -def LoadParamMemI8 : LoadParamMemInst<B16, ".b8">; -def LoadParamMemV2I64 : LoadParamV2MemInst<B64, ".b64">; -def LoadParamMemV2I32 : LoadParamV2MemInst<B32, ".b32">; -def LoadParamMemV2I16 : LoadParamV2MemInst<B16, ".b16">; -def LoadParamMemV2I8 : LoadParamV2MemInst<B16, ".b8">; -def LoadParamMemV4I32 : LoadParamV4MemInst<B32, ".b32">; -def LoadParamMemV4I16 : LoadParamV4MemInst<B16, ".b16">; -def LoadParamMemV4I8 : LoadParamV4MemInst<B16, ".b8">; - -defm StoreParamI64 : StoreParamInst<B64, i64imm, ".b64">; -defm StoreParamI32 : StoreParamInst<B32, i32imm, ".b32">; -defm StoreParamI16 : StoreParamInst<B16, i16imm, ".b16">; -defm StoreParamI8 : StoreParamInst<B16, i8imm, ".b8">; - -defm StoreParamI8TruncI32 : StoreParamInst<B32, i8imm, ".b8", /* support_imm */ false>; -defm StoreParamI8TruncI64 : StoreParamInst<B64, i8imm, ".b8", /* support_imm */ false>; - -defm StoreParamV2I64 : StoreParamV2Inst<B64, i64imm, ".b64">; -defm StoreParamV2I32 : StoreParamV2Inst<B32, i32imm, ".b32">; -defm StoreParamV2I16 : StoreParamV2Inst<B16, i16imm, ".b16">; -defm StoreParamV2I8 : StoreParamV2Inst<B16, i8imm, ".b8">; - -defm StoreParamV4I32 : StoreParamV4Inst<B32, i32imm, ".b32">; -defm StoreParamV4I16 : StoreParamV4Inst<B16, i16imm, ".b16">; -defm StoreParamV4I8 : StoreParamV4Inst<B16, i8imm, ".b8">; - -defm StoreParamF32 : 
StoreParamInst<B32, f32imm, ".b32">; -defm StoreParamF64 : StoreParamInst<B64, f64imm, ".b64">; - -defm StoreParamV2F32 : StoreParamV2Inst<B32, f32imm, ".b32">; -defm StoreParamV2F64 : StoreParamV2Inst<B64, f64imm, ".b64">; - -defm StoreParamV4F32 : StoreParamV4Inst<B32, f32imm, ".b32">; - def DECLARE_PARAM_array : NVPTXInst<(outs), (ins i32imm:$a, i32imm:$align, i32imm:$size), ".param .align $align .b8 \t$a[$size];", []>; diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index 5e54b82..67cc01e 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -534,16 +534,26 @@ static DecodeStatus decodeRTZArg(MCInst &Inst, uint32_t Imm, int64_t Address, return MCDisassembler::Success; } -static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder); - static DecodeStatus decodeZcmpRlist(MCInst &Inst, uint32_t Imm, uint64_t Address, - const MCDisassembler *Decoder); + const MCDisassembler *Decoder) { + bool IsRVE = Decoder->getSubtargetInfo().hasFeature(RISCV::FeatureStdExtE); + if (Imm < RISCVZC::RA || (IsRVE && Imm >= RISCVZC::RA_S0_S2)) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createImm(Imm)); + return MCDisassembler::Success; +} static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm, uint64_t Address, + const MCDisassembler *Decoder) { + if (Imm < RISCVZC::RA_S0) + return MCDisassembler::Fail; + return decodeZcmpRlist(Inst, Imm, Address, Decoder); +} + +static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn, + uint64_t Address, const MCDisassembler *Decoder); static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn, @@ -592,24 +602,6 @@ static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn, return S; } -static DecodeStatus decodeZcmpRlist(MCInst &Inst, uint32_t Imm, - uint64_t Address, - const MCDisassembler *Decoder) { - bool IsRVE = Decoder->getSubtargetInfo().hasFeature(RISCV::FeatureStdExtE); - if (Imm < RISCVZC::RA || (IsRVE && Imm >= RISCVZC::RA_S0_S2)) - return MCDisassembler::Fail; - Inst.addOperand(MCOperand::createImm(Imm)); - return MCDisassembler::Success; -} - -static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm, - uint64_t Address, - const MCDisassembler *Decoder) { - if (Imm < RISCVZC::RA_S0) - return MCDisassembler::Fail; - return decodeZcmpRlist(Inst, Imm, Address, Decoder); -} - // Add implied SP operand for C.*SP compressed instructions. The SP operand // isn't explicitly encoded in the instruction. void RISCVDisassembler::addSPOperands(MCInst &MI) const { diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 54845e5..607edd3 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2739,6 +2739,27 @@ bool RISCVTargetLowering::isLegalElementTypeForRVV(EVT ScalarTy) const { } } +bool RISCVTargetLowering::isLegalLoadStoreElementTypeForRVV( + EVT ScalarTy) const { + if (!ScalarTy.isSimple()) + return false; + switch (ScalarTy.getSimpleVT().SimpleTy) { + case MVT::iPTR: + return Subtarget.is64Bit() ? 
Subtarget.hasVInstructionsI64() : true; + case MVT::i8: + case MVT::i16: + case MVT::i32: + case MVT::f16: + case MVT::bf16: + case MVT::f32: + return true; + case MVT::i64: + case MVT::f64: + return Subtarget.hasVInstructionsI64(); + default: + return false; + } +} unsigned RISCVTargetLowering::combineRepeatedFPDivisors() const { return NumRepeatedDivisors; @@ -24239,7 +24260,7 @@ bool RISCVTargetLowering::isLegalStridedLoadStore(EVT DataType, return false; EVT ScalarType = DataType.getScalarType(); - if (!isLegalElementTypeForRVV(ScalarType)) + if (!isLegalLoadStoreElementTypeForRVV(ScalarType)) return false; if (!Subtarget.enableUnalignedVectorMem() && diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index ca70c46..a788c0b7 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -384,6 +384,7 @@ public: bool shouldRemoveExtendFromGSIndex(SDValue Extend, EVT DataVT) const override; bool isLegalElementTypeForRVV(EVT ScalarTy) const; + bool isLegalLoadStoreElementTypeForRVV(EVT ScalarTy) const; bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override; diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index 30d8f85..3cbe668 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -32,7 +32,7 @@ bool RISCVTargetLowering::isLegalInterleavedAccessType( if (!isTypeLegal(VT)) return false; - if (!isLegalElementTypeForRVV(VT.getScalarType()) || + if (!isLegalLoadStoreElementTypeForRVV(VT.getScalarType()) || !allowsMemoryAccessForAlignment(VTy->getContext(), DL, VT, AddrSpace, Alignment)) return false; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index d62d99c..f0510ec 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -265,7 +265,7 @@ public: if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize()) return false; - return TLI->isLegalElementTypeForRVV(ElemType); + return TLI->isLegalLoadStoreElementTypeForRVV(ElemType); } bool isLegalMaskedLoad(Type *DataType, Align Alignment, @@ -297,7 +297,7 @@ public: if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize()) return false; - return TLI->isLegalElementTypeForRVV(ElemType); + return TLI->isLegalLoadStoreElementTypeForRVV(ElemType); } bool isLegalMaskedGather(Type *DataType, Align Alignment) const override { diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index d13862f..143298b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1540,6 +1540,8 @@ multiclass SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, list<Predicate> def : Pat<(fadd_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))), (!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>; + def : Pat<(fsub_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))), + (!cast<Instruction>("NMADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>; } defm "" : SIMDMADD<F32x4, 0x105, 0x106, [HasRelaxedSIMD]>; diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp index ee6651c..6acb0bc 100644 --- 
a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -277,6 +277,8 @@ StringRef Triple::getVendorTypeName(VendorType Kind) { case PC: return "pc"; case SCEI: return "scei"; case SUSE: return "suse"; + case Meta: + return "meta"; } llvm_unreachable("Invalid VendorType!"); @@ -390,6 +392,8 @@ StringRef Triple::getEnvironmentTypeName(EnvironmentType Kind) { case OpenHOS: return "ohos"; case PAuthTest: return "pauthtest"; + case MTIA: + return "mtia"; case LLVM: return "llvm"; case Mlibc: @@ -677,6 +681,7 @@ static Triple::VendorType parseVendor(StringRef VendorName) { .Case("suse", Triple::SUSE) .Case("oe", Triple::OpenEmbedded) .Case("intel", Triple::Intel) + .Case("meta", Triple::Meta) .Default(Triple::UnknownVendor); } @@ -780,6 +785,7 @@ static Triple::EnvironmentType parseEnvironment(StringRef EnvironmentName) { .StartsWith("pauthtest", Triple::PAuthTest) .StartsWith("llvm", Triple::LLVM) .StartsWith("mlibc", Triple::Mlibc) + .StartsWith("mtia", Triple::MTIA) .Default(Triple::UnknownEnvironment); } diff --git a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp index b3910c4..d895cd7 100644 --- a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp +++ b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp @@ -37,6 +37,16 @@ // memory that ends up in one of the runtime equivalents, since this can // happen if e.g. a library that was compiled without interposition returns // an allocation that can be validly passed to `free`. +// +// 3. MathFixup (required): Some accelerators might have an incomplete +// implementation for the intrinsics used to implement some of the math +// functions in <cmath> / their corresponding libcall lowerings. Since this +// can vary quite significantly between accelerators, we replace calls to a +// set of intrinsics / lib functions known to be problematic with calls to a +// HIPSTDPAR-specific forwarding layer, which gives a uniform interface for +// accelerators to implement in their own runtime components. This pass +// should run before AcceleratorCodeSelection so as to prevent the spurious +// removal of the HIPSTDPAR-specific forwarding functions.
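Before the pass implementation below, a rough sketch of the renaming rule this description refers to. This is an editorial illustration, not part of the patch: the real pass rewrites llvm::Function names (and maps plain libcalls such as erff through the MathLibToHipStdPar table), and whether a given intrinsic is forwarded at all is decided by the switch in the pass itself.

    // Hypothetical standalone model of the forwarding-name rule: '.' becomes
    // '_' and the leading "llvm" becomes "__hipstdpar", so llvm.sin.f64 is
    // forwarded to __hipstdpar_sin_f64.
    #include <cassert>
    #include <string>

    static std::string forwardingName(std::string Name) {
      for (char &C : Name)
        if (C == '.')
          C = '_';                           // llvm.sin.f64 -> llvm_sin_f64
      return "__hipstdpar" + Name.substr(4); // drop "llvm", add the prefix
    }

    int main() {
      assert(forwardingName("llvm.sin.f64") == "__hipstdpar_sin_f64");
      assert(forwardingName("llvm.cos.f64") == "__hipstdpar_cos_f64");
      return 0;
    }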
//===----------------------------------------------------------------------===// #include "llvm/Transforms/HipStdPar/HipStdPar.h" @@ -49,6 +59,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/Transforms/Utils/ModuleUtils.h" @@ -519,3 +530,110 @@ HipStdParAllocationInterpositionPass::run(Module &M, ModuleAnalysisManager&) { return PreservedAnalyses::none(); } + +static constexpr std::pair<StringLiteral, StringLiteral> MathLibToHipStdPar[]{ + {"acosh", "__hipstdpar_acosh_f64"}, + {"acoshf", "__hipstdpar_acosh_f32"}, + {"asinh", "__hipstdpar_asinh_f64"}, + {"asinhf", "__hipstdpar_asinh_f32"}, + {"atanh", "__hipstdpar_atanh_f64"}, + {"atanhf", "__hipstdpar_atanh_f32"}, + {"cbrt", "__hipstdpar_cbrt_f64"}, + {"cbrtf", "__hipstdpar_cbrt_f32"}, + {"erf", "__hipstdpar_erf_f64"}, + {"erff", "__hipstdpar_erf_f32"}, + {"erfc", "__hipstdpar_erfc_f64"}, + {"erfcf", "__hipstdpar_erfc_f32"}, + {"fdim", "__hipstdpar_fdim_f64"}, + {"fdimf", "__hipstdpar_fdim_f32"}, + {"expm1", "__hipstdpar_expm1_f64"}, + {"expm1f", "__hipstdpar_expm1_f32"}, + {"hypot", "__hipstdpar_hypot_f64"}, + {"hypotf", "__hipstdpar_hypot_f32"}, + {"ilogb", "__hipstdpar_ilogb_f64"}, + {"ilogbf", "__hipstdpar_ilogb_f32"}, + {"lgamma", "__hipstdpar_lgamma_f64"}, + {"lgammaf", "__hipstdpar_lgamma_f32"}, + {"log1p", "__hipstdpar_log1p_f64"}, + {"log1pf", "__hipstdpar_log1p_f32"}, + {"logb", "__hipstdpar_logb_f64"}, + {"logbf", "__hipstdpar_logb_f32"}, + {"nextafter", "__hipstdpar_nextafter_f64"}, + {"nextafterf", "__hipstdpar_nextafter_f32"}, + {"nexttoward", "__hipstdpar_nexttoward_f64"}, + {"nexttowardf", "__hipstdpar_nexttoward_f32"}, + {"remainder", "__hipstdpar_remainder_f64"}, + {"remainderf", "__hipstdpar_remainder_f32"}, + {"remquo", "__hipstdpar_remquo_f64"}, + {"remquof", "__hipstdpar_remquo_f32"}, + {"scalbln", "__hipstdpar_scalbln_f64"}, + {"scalblnf", "__hipstdpar_scalbln_f32"}, + {"scalbn", "__hipstdpar_scalbn_f64"}, + {"scalbnf", "__hipstdpar_scalbn_f32"}, + {"tgamma", "__hipstdpar_tgamma_f64"}, + {"tgammaf", "__hipstdpar_tgamma_f32"}}; + +PreservedAnalyses HipStdParMathFixupPass::run(Module &M, + ModuleAnalysisManager &) { + if (M.empty()) + return PreservedAnalyses::all(); + + SmallVector<std::pair<Function *, std::string>> ToReplace; + for (auto &&F : M) { + if (!F.hasName()) + continue; + + StringRef N = F.getName(); + Intrinsic::ID ID = F.getIntrinsicID(); + + switch (ID) { + case Intrinsic::not_intrinsic: { + auto It = + find_if(MathLibToHipStdPar, [&](auto &&M) { return M.first == N; }); + if (It == std::cend(MathLibToHipStdPar)) + continue; + ToReplace.emplace_back(&F, It->second); + break; + } + case Intrinsic::acos: + case Intrinsic::asin: + case Intrinsic::atan: + case Intrinsic::atan2: + case Intrinsic::cosh: + case Intrinsic::modf: + case Intrinsic::sinh: + case Intrinsic::tan: + case Intrinsic::tanh: + break; + default: { + if (F.getReturnType()->isDoubleTy()) { + switch (ID) { + case Intrinsic::cos: + case Intrinsic::exp: + case Intrinsic::exp2: + case Intrinsic::log: + case Intrinsic::log10: + case Intrinsic::log2: + case Intrinsic::pow: + case Intrinsic::sin: + break; + default: + continue; + } + break; + } + continue; + } + } + + ToReplace.emplace_back(&F, N); + llvm::replace(ToReplace.back().second, '.', '_'); + StringRef Prefix = "llvm"; + ToReplace.back().second.replace(0, Prefix.size(), "__hipstdpar"); + } + for (auto &&[F, NewF] : ToReplace) + F->replaceAllUsesWith( + 
M.getOrInsertFunction(NewF, F->getFunctionType()).getCallee()); + + return PreservedAnalyses::none(); +} diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 0164fcd..2b392fe 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -97,6 +97,8 @@ STATISTIC(MissingAllocForContextId, "Number of missing alloc nodes for context ids"); STATISTIC(SkippedCallsCloning, "Number of calls skipped during cloning due to unexpected operand"); +STATISTIC(MismatchedCloneAssignments, + "Number of callsites assigned to call multiple non-matching clones"); static cl::opt<std::string> DotFilePathPrefix( "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, @@ -2060,6 +2062,20 @@ static bool isMemProfClone(const Function &F) { return F.getName().contains(MemProfCloneSuffix); } +// Return the clone number of the given function by extracting it from the +// memprof suffix. Assumes the caller has already confirmed it is a memprof +// clone. +static unsigned getMemProfCloneNum(const Function &F) { + assert(isMemProfClone(F)); + auto Pos = F.getName().find_last_of('.'); + assert(Pos > 0); + unsigned CloneNo; + bool Err = F.getName().drop_front(Pos + 1).getAsInteger(10, CloneNo); + assert(!Err); + (void)Err; + return CloneNo; +} + std::string ModuleCallsiteContextGraph::getLabel(const Function *Func, const Instruction *Call, unsigned CloneNo) const { @@ -3979,7 +3995,22 @@ IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const { void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) { - if (CalleeFunc.cloneNo() > 0) + auto *CurF = cast<CallBase>(CallerCall.call())->getCalledFunction(); + auto NewCalleeCloneNo = CalleeFunc.cloneNo(); + if (isMemProfClone(*CurF)) { + // If we already assigned this callsite to call a specific non-default + // clone (i.e. not the original function which is clone 0), ensure that we + // aren't trying to now update it to call a different clone, which is + // indicative of a bug in the graph or function assignment. + auto CurCalleeCloneNo = getMemProfCloneNum(*CurF); + if (CurCalleeCloneNo != NewCalleeCloneNo) { + LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was " + << CurCalleeCloneNo << " now " << NewCalleeCloneNo + << "\n"); + MismatchedCloneAssignments++; + } + } + if (NewCalleeCloneNo > 0) cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func()); OREGetter(CallerCall.call()->getFunction()) .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call()) @@ -3995,7 +4026,19 @@ void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall, assert(CI && "Caller cannot be an allocation which should not have profiled calls"); assert(CI->Clones.size() > CallerCall.cloneNo()); - CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo(); + auto NewCalleeCloneNo = CalleeFunc.cloneNo(); + auto &CurCalleeCloneNo = CI->Clones[CallerCall.cloneNo()]; + // If we already assigned this callsite to call a specific non-default + // clone (i.e. not the original function which is clone 0), ensure that we + // aren't trying to now update it to call a different clone, which is + // indicative of a bug in the graph or function assignment. 
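To make the clone-number bookkeeping above concrete, here is a hedged, self-contained sketch of the ".memprof.<N>" suffix convention that getMemProfCloneNum relies on. The helper below is hypothetical (the real code parses a StringRef with getAsInteger); it only illustrates the naming scheme.

    #include <cassert>
    #include <string>

    // Clone 2 of function "foo" is named "foo.memprof.2"; the clone number
    // is recovered by parsing the digits after the last '.'.
    static unsigned cloneNumFromName(const std::string &Name) {
      std::size_t Pos = Name.find_last_of('.');
      assert(Pos != std::string::npos && Pos > 0);
      return static_cast<unsigned>(std::stoul(Name.substr(Pos + 1)));
    }

    int main() {
      assert(cloneNumFromName("foo.memprof.2") == 2);
      // A callsite already bound to clone 2 that is later asked to call
      // clone 3 is exactly what the new MismatchedCloneAssignments
      // statistic counts.
      assert(cloneNumFromName("foo.memprof.3") == 3);
      return 0;
    }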
+ if (CurCalleeCloneNo != 0 && CurCalleeCloneNo != NewCalleeCloneNo) { + LLVM_DEBUG(dbgs() << "Mismatch in call clone assignment: was " + << CurCalleeCloneNo << " now " << NewCalleeCloneNo + << "\n"); + MismatchedCloneAssignments++; + } + CurCalleeCloneNo = NewCalleeCloneNo; } // Update the debug information attached to NewFunc to use the clone Name. Note @@ -4457,14 +4500,24 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc; }; + // Information for a single clone of this Func. + struct FuncCloneInfo { + // The function clone. + FuncInfo FuncClone; + // Remappings of each call of interest (from original uncloned call to the + // corresponding cloned call in this function clone). + std::map<CallInfo, CallInfo> CallMap; + }; + // Walk all functions for which we saw calls with memprof metadata, and handle // cloning for each of its calls. for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) { FuncInfo OrigFunc(Func); - // Map from each clone of OrigFunc to a map of remappings of each call of - // interest (from original uncloned call to the corresponding cloned call in - // that function clone). - std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap; + // Map from each clone number of OrigFunc to information about that function + // clone (the function clone FuncInfo and call remappings). The index into + // the vector is the clone number, as function clones are created and + // numbered sequentially. + std::vector<FuncCloneInfo> FuncCloneInfos; for (auto &Call : CallsWithMetadata) { ContextNode *Node = getNodeForInst(Call); // Skip call if we do not have a node for it (all uses of its stack ids @@ -4488,8 +4541,9 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { // Record the clone of callsite node assigned to this function clone. FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone; - assert(FuncClonesToCallMap.count(FuncClone)); - std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone]; + assert(FuncCloneInfos.size() > FuncClone.cloneNo()); + std::map<CallInfo, CallInfo> &CallMap = + FuncCloneInfos[FuncClone.cloneNo()].CallMap; CallInfo CallClone(Call); if (auto It = CallMap.find(Call); It != CallMap.end()) CallClone = It->second; @@ -4528,10 +4582,10 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { // than existing function clones, which would have been assigned to an // earlier clone in the list (we assign callsite clones to function // clones greedily). - if (FuncClonesToCallMap.size() < NodeCloneCount) { + if (FuncCloneInfos.size() < NodeCloneCount) { // If this is the first callsite copy, assign to original function. if (NodeCloneCount == 1) { - // Since FuncClonesToCallMap is empty in this case, no clones have + // Since FuncCloneInfos is empty in this case, no clones have // been created for this function yet, and no callers should have // been assigned a function clone for this callee node yet. assert(llvm::none_of( @@ -4540,7 +4594,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { })); // Initialize with empty call map, assign Clone to original function // and its callers, and skip to the next clone. 
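The map-to-vector change above leans on one invariant stated in the comment: function clones are created and numbered sequentially, so a clone's number can double as its index into FuncCloneInfos. A minimal sketch of that invariant, and of the "first available clone" scan it enables (toy types, not the patch's code), mirroring the FindFirstAvailFuncClone helper introduced below:

    #include <cassert>
    #include <optional>
    #include <set>
    #include <vector>

    struct ToyFuncCloneInfo {
      unsigned CloneNo; // CallMap elided for brevity
    };

    // Linear scan for the first clone not yet assigned a callsite clone of
    // the current node.
    static std::optional<unsigned>
    firstAvailClone(const std::vector<ToyFuncCloneInfo> &Clones,
                    const std::set<unsigned> &Assigned) {
      for (const ToyFuncCloneInfo &CF : Clones)
        if (!Assigned.count(CF.CloneNo))
          return CF.CloneNo;
      return std::nullopt; // the real code treats this case as unreachable
    }

    int main() {
      std::vector<ToyFuncCloneInfo> Clones = {{0}, {1}, {2}};
      assert(Clones[1].CloneNo == 1); // clone number == vector index
      assert(*firstAvailClone(Clones, {0, 2}) == 1);
      return 0;
    }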
- FuncClonesToCallMap[OrigFunc] = {}; + FuncCloneInfos.push_back({OrigFunc, {}}); AssignCallsiteCloneToFuncClone( OrigFunc, Call, Clone, AllocationCallToContextNodeMap.count(Call)); @@ -4572,14 +4626,14 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { } // Clone function and save it along with the CallInfo map created - // during cloning in the FuncClonesToCallMap. + // during cloning in the FuncCloneInfos. std::map<CallInfo, CallInfo> NewCallMap; - unsigned CloneNo = FuncClonesToCallMap.size(); + unsigned CloneNo = FuncCloneInfos.size(); assert(CloneNo > 0 && "Clone 0 is the original function, which " "should already exist in the map"); FuncInfo NewFuncClone = cloneFunctionForCallsite( OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo); - FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap)); + FuncCloneInfos.push_back({NewFuncClone, std::move(NewCallMap)}); FunctionClonesAnalysis++; Changed = true; @@ -4681,7 +4735,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { CallInfo OrigCall(Callee->getOrigNode()->Call); OrigCall.setCloneNo(0); std::map<CallInfo, CallInfo> &CallMap = - FuncClonesToCallMap[NewFuncClone]; + FuncCloneInfos[NewFuncClone.cloneNo()].CallMap; assert(CallMap.count(OrigCall)); CallInfo NewCall(CallMap[OrigCall]); assert(NewCall); @@ -4703,6 +4757,19 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { // where the callers were assigned to different clones of a function. } + auto FindFirstAvailFuncClone = [&]() { + // Find first function in FuncCloneInfos without an assigned + // clone of this callsite Node. We should always have one + // available at this point due to the earlier cloning when the + // FuncCloneInfos size was smaller than the clone number. + for (auto &CF : FuncCloneInfos) { + if (!FuncCloneToCurNodeCloneMap.count(CF.FuncClone)) + return CF.FuncClone; + } + llvm_unreachable( + "Expected an available func clone for this callsite clone"); + }; + // See if we can use existing function clone. Walk through // all caller edges to see if any have already been assigned to // a clone of this callsite's function. If we can use it, do so. If not, @@ -4819,16 +4886,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { // clone of OrigFunc for another caller during this iteration over // its caller edges. if (!FuncCloneAssignedToCurCallsiteClone) { - // Find first function in FuncClonesToCallMap without an assigned - // clone of this callsite Node. We should always have one - // available at this point due to the earlier cloning when the - // FuncClonesToCallMap size was smaller than the clone number. - for (auto &CF : FuncClonesToCallMap) { - if (!FuncCloneToCurNodeCloneMap.count(CF.first)) { - FuncCloneAssignedToCurCallsiteClone = CF.first; - break; - } - } + FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone(); assert(FuncCloneAssignedToCurCallsiteClone); // Assign Clone to FuncCloneAssignedToCurCallsiteClone AssignCallsiteCloneToFuncClone( @@ -4842,6 +4900,31 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { FuncCloneAssignedToCurCallsiteClone); } } + // If we didn't assign a function clone to this callsite clone yet, e.g. + // none of its callers has a non-null call, do the assignment here. + // We want to ensure that every callsite clone is assigned to some + // function clone, so that the call updates below work as expected. 
+ // In particular, if this is the original callsite, we want to ensure it + // is assigned to the original function, otherwise the original function + // will appear available for assignment to other callsite clones, + // leading to unintended effects. For one, any unknown or not-yet-updated + // callers will call into cloned paths leading to the wrong hints, + // because they still call the original function (clone 0). Also, + // because all callsites start out as being clone 0 by default, we can't + // easily distinguish between callsites explicitly assigned to clone 0 + // vs those never assigned, which can lead to multiple updates of the + // calls when invoking updateCall below, with mismatched clone values. + // TODO: Add a flag to the callsite nodes or some other mechanism to + // better distinguish and identify callsite clones that are not getting + // assigned to function clones as expected. + if (!FuncCloneAssignedToCurCallsiteClone) { + FuncCloneAssignedToCurCallsiteClone = FindFirstAvailFuncClone(); + assert(FuncCloneAssignedToCurCallsiteClone && + "No available func clone for this callsite clone"); + AssignCallsiteCloneToFuncClone( + FuncCloneAssignedToCurCallsiteClone, Call, Clone, + /*IsAlloc=*/AllocationCallToContextNodeMap.contains(Call)); + } } if (VerifyCCG) { checkNode<DerivedCCG, FuncTy, CallTy>(Node); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 033ef8b..a43a6ee 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -708,10 +708,14 @@ static Instruction *shrinkSplatShuffle(TruncInst &Trunc, auto *Shuf = dyn_cast<ShuffleVectorInst>(Trunc.getOperand(0)); if (Shuf && Shuf->hasOneUse() && match(Shuf->getOperand(1), m_Undef()) && all_equal(Shuf->getShuffleMask()) && - Shuf->getType() == Shuf->getOperand(0)->getType()) { + ElementCount::isKnownGE(Shuf->getType()->getElementCount(), + cast<VectorType>(Shuf->getOperand(0)->getType()) + ->getElementCount())) { // trunc (shuf X, Undef, SplatMask) --> shuf (trunc X), Poison, SplatMask // trunc (shuf X, Poison, SplatMask) --> shuf (trunc X), Poison, SplatMask - Value *NarrowOp = Builder.CreateTrunc(Shuf->getOperand(0), Trunc.getType()); + Type *NewTruncTy = Shuf->getOperand(0)->getType()->getWithNewType( + Trunc.getType()->getScalarType()); + Value *NarrowOp = Builder.CreateTrunc(Shuf->getOperand(0), NewTruncTy); return new ShuffleVectorInst(NarrowOp, Shuf->getShuffleMask()); } diff --git a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp index f6780c0..ce1d9f1 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp @@ -456,7 +456,7 @@ static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI, if (DisableMemOPOPT) return false; - if (F.hasFnAttribute(Attribute::OptimizeForSize)) + if (F.hasOptSize()) return false; MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI); MemOPSizeOpt.perform(); diff --git a/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp b/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp index 9fe655e..fca09c6 100644 --- a/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp +++ b/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp @@ -498,7 +498,7 @@ bool LibCallsShrinkWrap::perform(CallInst *CI) { static bool runImpl(Function &F, const TargetLibraryInfo &TLI, DominatorTree *DT) { - if 
(F.hasFnAttribute(Attribute::OptimizeForSize)) + if (F.hasOptSize()) return false; DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); LibCallsShrinkWrap CCDCE(TLI, DTU); diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index ddb062b..571fa11 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -1257,7 +1257,7 @@ Value *SCEVExpander::tryToReuseLCSSAPhi(const SCEVAddRecExpr *S) { assert(Diff->getType()->isIntegerTy() && "difference must be of integer type"); Value *DiffV = expand(Diff); - Value *BaseV = &PN; + Value *BaseV = fixupLCSSAFormFor(&PN); if (PhiTy->isPointerTy()) { if (STy->isPointerTy()) return Builder.CreatePtrAdd(BaseV, DiffV); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 6616e61f..7b7efb8 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1363,11 +1363,15 @@ public: TTI.hasActiveVectorLength() && !EnableVPlanNativePath; if (EVLIsLegal) return; - // If for some reason EVL mode is unsupported, fallback to - // DataWithoutLaneMask to try to vectorize the loop with folded tail - // in a generic way. - ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask, - TailFoldingStyle::DataWithoutLaneMask}; + // If for some reason EVL mode is unsupported, fall back to a scalar + // epilogue if it's allowed, or DataWithoutLaneMask otherwise. + if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed || + ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) + ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None}; + else + ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask, + TailFoldingStyle::DataWithoutLaneMask}; + LLVM_DEBUG( dbgs() << "LV: Preference for VP intrinsics indicated. Will " "not try to generate VP Intrinsics " @@ -4500,19 +4504,17 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( Type *TCType = Legal->getWidestInductionType(); const SCEV *RemainingIterations = nullptr; unsigned MaxTripCount = 0; - if (MainLoopVF.isFixed()) { - // TODO: extend to support scalable VFs. - const SCEV *TC = vputils::getSCEVExprForVPValue( - getPlanFor(MainLoopVF).getTripCount(), SE); - assert(!isa<SCEVCouldNotCompute>(TC) && - "Trip count SCEV must be computable"); - RemainingIterations = SE.getURemExpr( - TC, SE.getConstant(TCType, MainLoopVF.getFixedValue() * IC)); - - // No iterations left to process in the epilogue. - if (RemainingIterations->isZero()) - return Result; + const SCEV *TC = + vputils::getSCEVExprForVPValue(getPlanFor(MainLoopVF).getTripCount(), SE); + assert(!isa<SCEVCouldNotCompute>(TC) && "Trip count SCEV must be computable"); + RemainingIterations = + SE.getURemExpr(TC, SE.getElementCount(TCType, MainLoopVF * IC)); + + // No iterations left to process in the epilogue. 
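A quick worked example of the arithmetic in the hunk above (illustration only, not part of the patch): the epilogue's remaining iterations are TC urem (MainLoopVF * IC). With a fixed main-loop VF of 4, IC = 2, and a trip count of 37, that is 37 urem 8 = 5 iterations left for the epilogue. With a scalable VF of vscale x 4 and IC = 2, the same quantity becomes 37 urem (vscale x 8), which SE.getElementCount now lets the planner form symbolically; previously the MainLoopVF.isFixed() guard skipped this computation entirely for scalable VFs.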
+ if (RemainingIterations->isZero()) + return Result; + if (MainLoopVF.isFixed()) { MaxTripCount = MainLoopVF.getFixedValue() * IC - 1; if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations, SE.getConstant(TCType, MaxTripCount))) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 935a4e4..8de05c1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1094,6 +1094,10 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(1)))) return Def->replaceAllUsesWith(A); + if (match(Def, m_c_Mul(m_VPValue(A), m_SpecificInt(0)))) + return Def->replaceAllUsesWith(R.getOperand(0) == A ? R.getOperand(1) + : R.getOperand(0)); + if (match(Def, m_Not(m_VPValue(A)))) { if (match(A, m_Not(m_VPValue(A)))) return Def->replaceAllUsesWith(A); diff --git a/llvm/test/Analysis/CostModel/RISCV/masked_ldst.ll b/llvm/test/Analysis/CostModel/RISCV/masked_ldst.ll index 892277a..68c89c3 100644 --- a/llvm/test/Analysis/CostModel/RISCV/masked_ldst.ll +++ b/llvm/test/Analysis/CostModel/RISCV/masked_ldst.ll @@ -13,14 +13,14 @@ define void @fixed() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x double> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i64> undef) -; 
CHECK-NEXT: Cost Model: Found an estimated cost of 223 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; entry: diff --git a/llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll b/llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll new file mode 100644 index 0000000..c4a027c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll @@ -0,0 +1,162 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -tail-dup-pred-size=2 -tail-dup-succ-size=2 -o - %s | FileCheck %s + +target triple = "arm64-apple-macosx13.0.0" + +@opcode.targets = local_unnamed_addr constant [6 x ptr] [ptr blockaddress(@test_interp, %op1.bb), ptr blockaddress(@test_interp, %op6.bb), ptr blockaddress(@test_interp, %loop.header), ptr blockaddress(@test_interp, %op2.bb), ptr blockaddress(@test_interp, %op4.bb), ptr blockaddress(@test_interp, %op5.bb)] + +define void @test_interp(ptr %frame, ptr %dst) { +; CHECK-LABEL: test_interp: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: stp x24, x23, [sp, #-64]! ; 16-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #16] ; 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #32] ; 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #48] ; 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: .cfi_offset w19, -24 +; CHECK-NEXT: .cfi_offset w20, -32 +; CHECK-NEXT: .cfi_offset w21, -40 +; CHECK-NEXT: .cfi_offset w22, -48 +; CHECK-NEXT: .cfi_offset w23, -56 +; CHECK-NEXT: .cfi_offset w24, -64 +; CHECK-NEXT: Lloh0: +; CHECK-NEXT: adrp x21, _opcode.targets@PAGE +; CHECK-NEXT: Lloh1: +; CHECK-NEXT: add x21, x21, _opcode.targets@PAGEOFF +; CHECK-NEXT: mov x22, xzr +; CHECK-NEXT: add x8, x21, xzr, lsl #3 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: mov x20, x0 +; CHECK-NEXT: add x23, x22, #1 +; CHECK-NEXT: br x8 +; CHECK-NEXT: Ltmp0: ; Block address taken +; CHECK-NEXT: LBB0_1: ; %loop.header +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add x8, x21, x23, lsl #3 +; CHECK-NEXT: mov x20, xzr +; CHECK-NEXT: mov x22, xzr +; CHECK-NEXT: add x23, x23, #1 +; CHECK-NEXT: br x8 +; CHECK-NEXT: Ltmp1: ; Block address taken +; CHECK-NEXT: LBB0_2: ; %op1.bb +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: str xzr, [x19] +; CHECK-NEXT: mov w8, #1 ; =0x1 +; CHECK-NEXT: ldr x0, [x20, #-8]! 
+; CHECK-NEXT: ldr x9, [x0, #8] +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: ldr x8, [x9, #48] +; CHECK-NEXT: blr x8 +; CHECK-NEXT: add x8, x21, x23, lsl #3 +; CHECK-NEXT: add x23, x23, #1 +; CHECK-NEXT: br x8 +; CHECK-NEXT: Ltmp2: ; Block address taken +; CHECK-NEXT: LBB0_3: ; %op2.bb +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add x8, x21, x23, lsl #3 +; CHECK-NEXT: mov x20, xzr +; CHECK-NEXT: add x23, x23, #1 +; CHECK-NEXT: str x22, [x19] +; CHECK-NEXT: mov x22, xzr +; CHECK-NEXT: br x8 +; CHECK-NEXT: Ltmp3: ; Block address taken +; CHECK-NEXT: LBB0_4: ; %op4.bb +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: str x22, [x19] +; CHECK-NEXT: add x10, x21, x23, lsl #3 +; CHECK-NEXT: add x23, x23, #1 +; CHECK-NEXT: ldur x8, [x22, #12] +; CHECK-NEXT: ldur x9, [x20, #-8] +; CHECK-NEXT: add x22, x22, #20 +; CHECK-NEXT: stp x8, x9, [x20, #-8] +; CHECK-NEXT: add x20, x20, #8 +; CHECK-NEXT: br x10 +; CHECK-NEXT: Ltmp4: ; Block address taken +; CHECK-NEXT: LBB0_5: ; %op5.bb +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: str x22, [x19] +; CHECK-NEXT: add x10, x21, x23, lsl #3 +; CHECK-NEXT: add x23, x23, #1 +; CHECK-NEXT: ldur x8, [x22, #12] +; CHECK-NEXT: ldur x9, [x20, #-8] +; CHECK-NEXT: add x22, x22, #20 +; CHECK-NEXT: stp x8, x9, [x20, #-8] +; CHECK-NEXT: add x20, x20, #8 +; CHECK-NEXT: br x10 +; CHECK-NEXT: Ltmp5: ; Block address taken +; CHECK-NEXT: LBB0_6: ; %op6.bb +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr x0, [x20, #-8]! +; CHECK-NEXT: mov w8, #1 ; =0x1 +; CHECK-NEXT: ldr x9, [x0, #8] +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: ldr x8, [x9, #48] +; CHECK-NEXT: blr x8 +; CHECK-NEXT: add x8, x21, x23, lsl #3 +; CHECK-NEXT: add x23, x23, #1 +; CHECK-NEXT: br x8 +; CHECK-NEXT: .loh AdrpAdd Lloh0, Lloh1 +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %op1.bb ], [ %iv.next, %op2.bb ], [ %iv.next, %op4.bb ], [ %iv.next, %op5.bb ], [ %iv.next, %op6.bb ], [ %iv.next, %loop.header ] + %stack.pointer = phi ptr [ %frame, %entry ], [ %stack.8, %op1.bb ], [ null, %op2.bb ], [ %stack.next, %op4.bb ], [ %stack.next.2, %op5.bb ], [ %stack.4, %op6.bb ], [ null, %loop.header ] + %next.instr = phi ptr [ null, %entry ], [ %next.instr, %op1.bb ], [ null, %op2.bb ], [ %next.instr.20, %op4.bb ], [ %next.instr.21, %op5.bb ], [ %next.instr, %op6.bb ], [ null, %loop.header ] + %iv.next = add i64 %iv, 1 + %next_op = getelementptr [6 x ptr], ptr @opcode.targets, i64 0, i64 %iv + indirectbr ptr %next_op, [label %op1.bb, label %op6.bb, label %loop.header, label %op2.bb, label %op4.bb, label %op5.bb] + +op1.bb: + store ptr null, ptr %dst, align 8 + %stack.8 = getelementptr i8, ptr %stack.pointer, i64 -8 + %l.0 = load ptr, ptr %stack.8, align 8 + store i64 1, ptr %l.0, align 8 + %gep.0 = getelementptr i8, ptr %l.0, i64 8 + %l.1 = load ptr, ptr %gep.0, align 8 + %gep.1 = getelementptr i8, ptr %l.1, i64 48 + %l.2 = load ptr, ptr %gep.1, align 8 + tail call void %l.2(ptr nonnull %l.0) + br label %loop.header + +op2.bb: + store ptr %next.instr, ptr %dst, align 8 + br label %loop.header + +op4.bb: + store ptr %next.instr, ptr %dst, align 8 + %next.instr.20 = getelementptr i8, ptr %next.instr, i64 20 + %stack.2 = getelementptr i8, ptr %stack.pointer, i64 -8 + %l.3 = load ptr, ptr %stack.2, align 8 + %next.instr.12 = getelementptr i8, ptr %next.instr, i64 12 + %next.instr.12.val = load ptr, ptr %next.instr.12, align 2 + store ptr %next.instr.12.val, ptr %stack.2, align 8 + store ptr %l.3, ptr 
%stack.pointer, align 8 + %stack.next = getelementptr i8, ptr %stack.pointer, i64 8 + br label %loop.header + +op5.bb: + store ptr %next.instr, ptr %dst, align 8 + %next.instr.21 = getelementptr i8, ptr %next.instr, i64 20 + %stack.3 = getelementptr i8, ptr %stack.pointer, i64 -8 + %l.4 = load ptr, ptr %stack.3, align 8 + %next.instr.2 = getelementptr i8, ptr %next.instr, i64 12 + %next.instr.2.val = load ptr, ptr %next.instr.2, align 2 + store ptr %next.instr.2.val, ptr %stack.3, align 8 + store ptr %l.4, ptr %stack.pointer, align 8 + %stack.next.2 = getelementptr i8, ptr %stack.pointer, i64 8 + br label %loop.header + +op6.bb: + %stack.4 = getelementptr i8, ptr %stack.pointer, i64 -8 + %l.5 = load ptr, ptr %stack.4, align 8 + store i64 1, ptr %l.5, align 8 + %gep.5 = getelementptr i8, ptr %l.5, i64 8 + %l.6 = load ptr, ptr %gep.5, align 8 + %gep.6 = getelementptr i8, ptr %l.6, i64 48 + %l.7 = load ptr, ptr %gep.6, align 8 + tail call void %l.7(ptr nonnull %l.5) + br label %loop.header +} diff --git a/llvm/test/CodeGen/AArch64/preferred-function-alignment.ll b/llvm/test/CodeGen/AArch64/preferred-function-alignment.ll index 05f4fb1..a6cb712 100644 --- a/llvm/test/CodeGen/AArch64/preferred-function-alignment.ll +++ b/llvm/test/CodeGen/AArch64/preferred-function-alignment.ll @@ -40,3 +40,10 @@ define void @test_optsize() optsize { ; CHECK-LABEL: test_optsize ; CHECK-NEXT: .p2align 2 + +define void @test_minsize() minsize { + ret void +} + +; CHECK-LABEL: test_minsize +; CHECK-NEXT: .p2align 2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir index 789385d..b770d43 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-post-legalize.mir @@ -1,12 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner %s -o - | FileCheck -check-prefix=GFX9 %s -# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner -fp-contract=fast %s -o - | FileCheck -check-prefix=GFX9-CONTRACT %s # RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner --denormal-fp-math=preserve-sign %s -o - | FileCheck -check-prefix=GFX9-DENORM %s -# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner -enable-unsafe-fp-math %s -o - | FileCheck -check-prefix=GFX9-UNSAFE %s # RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner %s -o - | FileCheck -check-prefix=GFX10 %s -# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -fp-contract=fast %s -o - | FileCheck -check-prefix=GFX10-CONTRACT %s # RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner --denormal-fp-math=preserve-sign %s -o - | FileCheck -check-prefix=GFX10-DENORM %s -# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -enable-unsafe-fp-math %s -o - | FileCheck -check-prefix=GFX10-UNSAFE %s --- name: test_f32_add_mul @@ -24,15 +20,7 @@ body: | ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[COPY2]] ; GFX9-NEXT: $vgpr0 = COPY [[FADD]](s32) ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; GFX9-CONTRACT-LABEL: name: test_f32_add_mul - ; GFX9-CONTRACT: liveins: 
$vgpr0, $vgpr1, $vgpr2 - ; GFX9-CONTRACT-NEXT: {{ $}} - ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]] - ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32) - ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; ; GFX9-DENORM-LABEL: name: test_f32_add_mul ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-DENORM-NEXT: {{ $}} @@ -43,15 +31,7 @@ body: | ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[COPY2]] ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32) ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; GFX9-UNSAFE-LABEL: name: test_f32_add_mul - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]] - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32) - ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; ; GFX10-LABEL: name: test_f32_add_mul ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} @@ -62,15 +42,7 @@ body: | ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[COPY2]] ; GFX10-NEXT: $vgpr0 = COPY [[FADD]](s32) ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; GFX10-CONTRACT-LABEL: name: test_f32_add_mul - ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX10-CONTRACT-NEXT: {{ $}} - ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]] - ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32) - ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; ; GFX10-DENORM-LABEL: name: test_f32_add_mul ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX10-DENORM-NEXT: {{ $}} @@ -81,15 +53,6 @@ body: | ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[COPY2]] ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32) ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; GFX10-UNSAFE-LABEL: name: test_f32_add_mul - ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX10-UNSAFE-NEXT: {{ $}} - ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]] - ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32) - ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = COPY $vgpr2 @@ -100,6 +63,60 @@ body: | ... 
--- +name: test_f32_add_mul_contract +body: | + bb.1.entry: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX9-LABEL: name: test_f32_add_mul_contract + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]] + ; GFX9-NEXT: $vgpr0 = COPY [[FMA]](s32) + ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; + ; GFX9-DENORM-LABEL: name: test_f32_add_mul_contract + ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9-DENORM-NEXT: {{ $}} + ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]] + ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32) + ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; + ; GFX10-LABEL: name: test_f32_add_mul_contract + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]] + ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32) + ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; + ; GFX10-DENORM-LABEL: name: test_f32_add_mul_contract + ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10-DENORM-NEXT: {{ $}} + ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]] + ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32) + ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %4:_(s32) = contract G_FMUL %0, %1 + %5:_(s32) = contract G_FADD %4, %2 + $vgpr0 = COPY %5(s32) + S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 +... 
+ +--- name: test_f32_add_mul_rhs body: | bb.1.entry: @@ -115,15 +132,7 @@ body: | ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY2]], [[FMUL]] ; GFX9-NEXT: $vgpr0 = COPY [[FADD]](s32) ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; GFX9-CONTRACT-LABEL: name: test_f32_add_mul_rhs - ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX9-CONTRACT-NEXT: {{ $}} - ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]] - ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32) - ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; ; GFX9-DENORM-LABEL: name: test_f32_add_mul_rhs ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-DENORM-NEXT: {{ $}} @@ -134,15 +143,7 @@ body: | ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY2]], [[FMUL]] ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32) ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; GFX9-UNSAFE-LABEL: name: test_f32_add_mul_rhs - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]] - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32) - ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; ; GFX10-LABEL: name: test_f32_add_mul_rhs ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} @@ -153,15 +154,7 @@ body: | ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY2]], [[FMUL]] ; GFX10-NEXT: $vgpr0 = COPY [[FADD]](s32) ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; GFX10-CONTRACT-LABEL: name: test_f32_add_mul_rhs - ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX10-CONTRACT-NEXT: {{ $}} - ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]] - ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32) - ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; ; GFX10-DENORM-LABEL: name: test_f32_add_mul_rhs ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX10-DENORM-NEXT: {{ $}} @@ -172,15 +165,6 @@ body: | ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY2]], [[FMUL]] ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32) ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; GFX10-UNSAFE-LABEL: name: test_f32_add_mul_rhs - ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX10-UNSAFE-NEXT: {{ $}} - ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]] - ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32) - ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = COPY $vgpr2 @@ -191,6 +175,60 @@ body: | ... 
--- +name: test_f32_add_mul_rhs_contract +body: | + bb.1.entry: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX9-LABEL: name: test_f32_add_mul_rhs_contract + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]] + ; GFX9-NEXT: $vgpr0 = COPY [[FMA]](s32) + ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; + ; GFX9-DENORM-LABEL: name: test_f32_add_mul_rhs_contract + ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9-DENORM-NEXT: {{ $}} + ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]] + ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32) + ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; + ; GFX10-LABEL: name: test_f32_add_mul_rhs_contract + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]] + ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32) + ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; + ; GFX10-DENORM-LABEL: name: test_f32_add_mul_rhs_contract + ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10-DENORM-NEXT: {{ $}} + ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]] + ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32) + ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %4:_(s32) = contract G_FMUL %0, %1 + %5:_(s32) = contract G_FADD %2, %4 + $vgpr0 = COPY %5(s32) + S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 +... 
+ +--- name: test_add_mul_multiple_defs_z body: | bb.1.entry: @@ -209,18 +247,7 @@ body: | ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV1]] ; GFX9-NEXT: $vgpr0 = COPY [[FADD]](s32) - ; GFX9-CONTRACT-LABEL: name: test_add_mul_multiple_defs_z - ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX9-CONTRACT-NEXT: {{ $}} - ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-CONTRACT-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1) - ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) - ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]] - ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32) + ; ; GFX9-DENORM-LABEL: name: test_add_mul_multiple_defs_z ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX9-DENORM-NEXT: {{ $}} @@ -234,18 +261,7 @@ body: | ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV1]] ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32) - ; GFX9-UNSAFE-LABEL: name: test_add_mul_multiple_defs_z - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-UNSAFE-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1) - ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) - ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]] - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32) + ; ; GFX10-LABEL: name: test_add_mul_multiple_defs_z ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} @@ -259,18 +275,7 @@ body: | ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV1]] ; GFX10-NEXT: $vgpr0 = COPY [[FADD]](s32) - ; GFX10-CONTRACT-LABEL: name: test_add_mul_multiple_defs_z - ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX10-CONTRACT-NEXT: {{ $}} - ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX10-CONTRACT-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1) - ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) - ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]] - ; GFX10-CONTRACT-NEXT: 
$vgpr0 = COPY [[FMA]](s32) + ; ; GFX10-DENORM-LABEL: name: test_add_mul_multiple_defs_z ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX10-DENORM-NEXT: {{ $}} @@ -284,18 +289,6 @@ body: | ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL]], [[UV1]] ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32) - ; GFX10-UNSAFE-LABEL: name: test_add_mul_multiple_defs_z - ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX10-UNSAFE-NEXT: {{ $}} - ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX10-UNSAFE-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1) - ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) - ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]] - ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %4:_(s32) = COPY $vgpr2 @@ -310,6 +303,76 @@ body: | ... --- +name: test_add_mul_multiple_defs_z_contract +body: | + bb.1.entry: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; GFX9-LABEL: name: test_add_mul_multiple_defs_z_contract + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1) + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) + ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]] + ; GFX9-NEXT: $vgpr0 = COPY [[FMA]](s32) + ; + ; GFX9-DENORM-LABEL: name: test_add_mul_multiple_defs_z_contract + ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX9-DENORM-NEXT: {{ $}} + ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX9-DENORM-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1) + ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) + ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]] + ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32) + ; + ; GFX10-LABEL: name: test_add_mul_multiple_defs_z_contract + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x 
s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1) + ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) + ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]] + ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32) + ; + ; GFX10-DENORM-LABEL: name: test_add_mul_multiple_defs_z_contract + ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10-DENORM-NEXT: {{ $}} + ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX10-DENORM-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1) + ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) + ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]] + ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %4:_(s32) = COPY $vgpr2 + %5:_(s32) = COPY $vgpr3 + %2:_(p1) = G_MERGE_VALUES %4(s32), %5(s32) + %6:_(s32) = contract G_FMUL %0, %1 + %7:_(<2 x s32>) = G_LOAD %2(p1) :: (load (<2 x s32>), addrspace 1) + %12:_(s32), %13:_(s32) = G_UNMERGE_VALUES %7(<2 x s32>) + %8:_(s32) = COPY %13(s32) + %10:_(s32) = contract G_FADD %6, %8 + $vgpr0 = COPY %10(s32) +... + +--- name: test_add_mul_rhs_multiple_defs_z body: | bb.1.entry: @@ -328,18 +391,7 @@ body: | ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[UV1]], [[FMUL]] ; GFX9-NEXT: $vgpr0 = COPY [[FADD]](s32) - ; GFX9-CONTRACT-LABEL: name: test_add_mul_rhs_multiple_defs_z - ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX9-CONTRACT-NEXT: {{ $}} - ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-CONTRACT-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1) - ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) - ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]] - ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32) + ; ; GFX9-DENORM-LABEL: name: test_add_mul_rhs_multiple_defs_z ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX9-DENORM-NEXT: {{ $}} @@ -353,18 +405,7 @@ body: | ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[UV1]], [[FMUL]] ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32) - ; GFX9-UNSAFE-LABEL: name: test_add_mul_rhs_multiple_defs_z - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), 
[[COPY3]](s32) - ; GFX9-UNSAFE-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1) - ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) - ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]] - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32) + ; ; GFX10-LABEL: name: test_add_mul_rhs_multiple_defs_z ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX10-NEXT: {{ $}} @@ -378,18 +419,7 @@ body: | ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[UV1]], [[FMUL]] ; GFX10-NEXT: $vgpr0 = COPY [[FADD]](s32) - ; GFX10-CONTRACT-LABEL: name: test_add_mul_rhs_multiple_defs_z - ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX10-CONTRACT-NEXT: {{ $}} - ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX10-CONTRACT-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1) - ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) - ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]] - ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32) + ; ; GFX10-DENORM-LABEL: name: test_add_mul_rhs_multiple_defs_z ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX10-DENORM-NEXT: {{ $}} @@ -403,18 +433,6 @@ body: | ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[UV1]], [[FMUL]] ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32) - ; GFX10-UNSAFE-LABEL: name: test_add_mul_rhs_multiple_defs_z - ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GFX10-UNSAFE-NEXT: {{ $}} - ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX10-UNSAFE-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1) - ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) - ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]] - ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %4:_(s32) = COPY $vgpr2 @@ -429,6 +447,76 @@ body: | ... 
--- +name: test_add_mul_rhs_multiple_defs_z_contract +body: | + bb.1.entry: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; GFX9-LABEL: name: test_add_mul_rhs_multiple_defs_z_contract + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1) + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) + ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]] + ; GFX9-NEXT: $vgpr0 = COPY [[FMA]](s32) + ; + ; GFX9-DENORM-LABEL: name: test_add_mul_rhs_multiple_defs_z_contract + ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX9-DENORM-NEXT: {{ $}} + ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX9-DENORM-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1) + ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) + ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]] + ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32) + ; + ; GFX10-LABEL: name: test_add_mul_rhs_multiple_defs_z_contract + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1) + ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) + ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]] + ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32) + ; + ; GFX10-DENORM-LABEL: name: test_add_mul_rhs_multiple_defs_z_contract + ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10-DENORM-NEXT: {{ $}} + ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX10-DENORM-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[MV]](p1) :: (load (<2 x s32>), addrspace 1) + ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<2 x s32>) + ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[UV1]] + ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %4:_(s32) = COPY $vgpr2 + %5:_(s32) = COPY $vgpr3 + %2:_(p1) = G_MERGE_VALUES %4(s32), %5(s32) + %6:_(s32) = contract G_FMUL %0, %1 + %7:_(<2 x s32>) = G_LOAD %2(p1) :: 
(load (<2 x s32>), addrspace 1) + %12:_(s32), %13:_(s32) = G_UNMERGE_VALUES %7(<2 x s32>) + %8:_(s32) = COPY %13(s32) + %10:_(s32) = contract G_FADD %8, %6 + $vgpr0 = COPY %10(s32) +... + +--- name: test_half_add_mul body: | bb.1.entry: @@ -448,19 +536,7 @@ body: | ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; GFX9-CONTRACT-LABEL: name: test_half_add_mul - ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX9-CONTRACT-NEXT: {{ $}} - ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) - ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]] - ; GFX9-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16) - ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) - ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; ; GFX9-DENORM-LABEL: name: test_half_add_mul ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-DENORM-NEXT: {{ $}} @@ -475,19 +551,7 @@ body: | ; GFX9-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16) ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; GFX9-UNSAFE-LABEL: name: test_half_add_mul - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) - ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]] - ; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) - ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; ; GFX10-LABEL: name: test_half_add_mul ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} @@ -502,19 +566,7 @@ body: | ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16) ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; GFX10-CONTRACT-LABEL: name: test_half_add_mul - ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX10-CONTRACT-NEXT: {{ $}} - ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) - ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]] - ; GFX10-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16) - ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) - ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return 
$sgpr30_sgpr31, implicit $vgpr0 + ; ; GFX10-DENORM-LABEL: name: test_half_add_mul ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX10-DENORM-NEXT: {{ $}} @@ -529,19 +581,6 @@ body: | ; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16) ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; GFX10-UNSAFE-LABEL: name: test_half_add_mul - ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX10-UNSAFE-NEXT: {{ $}} - ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) - ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]] - ; GFX10-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16) - ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) - ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 %4:_(s32) = COPY $vgpr0 %0:_(s16) = G_TRUNC %4(s32) %5:_(s32) = COPY $vgpr1 @@ -556,6 +595,80 @@ body: | ... --- +name: test_half_add_mul_contract +body: | + bb.1.entry: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX9-LABEL: name: test_half_add_mul_contract + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]] + ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; + ; GFX9-DENORM-LABEL: name: test_half_add_mul_contract + ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9-DENORM-NEXT: {{ $}} + ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]] + ; GFX9-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16) + ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; + ; GFX10-LABEL: name: test_half_add_mul_contract + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]] + ; GFX10-NEXT: 
[[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16) + ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; + ; GFX10-DENORM-LABEL: name: test_half_add_mul_contract + ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10-DENORM-NEXT: {{ $}} + ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]] + ; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16) + ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + %4:_(s32) = COPY $vgpr0 + %0:_(s16) = G_TRUNC %4(s32) + %5:_(s32) = COPY $vgpr1 + %1:_(s16) = G_TRUNC %5(s32) + %6:_(s32) = COPY $vgpr2 + %2:_(s16) = G_TRUNC %6(s32) + %7:_(s16) = contract G_FMUL %0, %1 + %8:_(s16) = contract G_FADD %7, %2 + %10:_(s32) = G_ANYEXT %8(s16) + $vgpr0 = COPY %10(s32) + S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 +... + +--- name: test_half_add_mul_rhs body: | bb.1.entry: @@ -575,19 +688,7 @@ body: | ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; GFX9-CONTRACT-LABEL: name: test_half_add_mul_rhs - ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX9-CONTRACT-NEXT: {{ $}} - ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) - ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]] - ; GFX9-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16) - ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) - ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; ; GFX9-DENORM-LABEL: name: test_half_add_mul_rhs ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-DENORM-NEXT: {{ $}} @@ -602,19 +703,7 @@ body: | ; GFX9-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16) ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; GFX9-UNSAFE-LABEL: name: test_half_add_mul_rhs - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) - ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]] - ; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) - ; 
GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; ; GFX10-LABEL: name: test_half_add_mul_rhs ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} @@ -629,19 +718,7 @@ body: | ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16) ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; GFX10-CONTRACT-LABEL: name: test_half_add_mul_rhs - ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX10-CONTRACT-NEXT: {{ $}} - ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) - ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]] - ; GFX10-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16) - ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) - ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; ; GFX10-DENORM-LABEL: name: test_half_add_mul_rhs ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX10-DENORM-NEXT: {{ $}} @@ -656,19 +733,6 @@ body: | ; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16) ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; GFX10-UNSAFE-LABEL: name: test_half_add_mul_rhs - ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX10-UNSAFE-NEXT: {{ $}} - ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) - ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]] - ; GFX10-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16) - ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) - ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 %4:_(s32) = COPY $vgpr0 %0:_(s16) = G_TRUNC %4(s32) %5:_(s32) = COPY $vgpr1 @@ -683,6 +747,80 @@ body: | ... 
--- +name: test_half_add_mul_rhs_contract +body: | + bb.1.entry: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX9-LABEL: name: test_half_add_mul_rhs_contract + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]] + ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; + ; GFX9-DENORM-LABEL: name: test_half_add_mul_rhs_contract + ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9-DENORM-NEXT: {{ $}} + ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]] + ; GFX9-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16) + ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; + ; GFX10-LABEL: name: test_half_add_mul_rhs_contract + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]] + ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16) + ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; + ; GFX10-DENORM-LABEL: name: test_half_add_mul_rhs_contract + ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10-DENORM-NEXT: {{ $}} + ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]] + ; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16) + ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + %4:_(s32) = COPY $vgpr0 + %0:_(s16) = G_TRUNC %4(s32) + %5:_(s32) = COPY $vgpr1 + %1:_(s16) = G_TRUNC %5(s32) + %6:_(s32) = COPY $vgpr2 + %2:_(s16) = G_TRUNC %6(s32) + %7:_(s16) = contract G_FMUL %0, %1 + %8:_(s16) = contract G_FADD %2, %7 + %10:_(s32) = G_ANYEXT %8(s16) + $vgpr0 = COPY %10(s32) + 
S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 +... + +--- name: test_double_add_mul body: | bb.1.entry: @@ -706,23 +844,7 @@ body: | ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; GFX9-CONTRACT-LABEL: name: test_double_add_mul - ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX9-CONTRACT-NEXT: {{ $}} - ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] - ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) - ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; ; GFX9-DENORM-LABEL: name: test_double_add_mul ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GFX9-DENORM-NEXT: {{ $}} @@ -741,23 +863,7 @@ body: | ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; GFX9-UNSAFE-LABEL: name: test_double_add_mul - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] - ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; ; GFX10-LABEL: name: test_double_add_mul ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GFX10-NEXT: {{ $}} @@ -776,23 +882,7 @@ body: | ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; GFX10-CONTRACT-LABEL: name: test_double_add_mul - ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX10-CONTRACT-NEXT: {{ $}} - ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - 
; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] - ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) - ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; ; GFX10-DENORM-LABEL: name: test_double_add_mul ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GFX10-DENORM-NEXT: {{ $}} @@ -811,23 +901,6 @@ body: | ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; GFX10-UNSAFE-LABEL: name: test_double_add_mul - ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX10-UNSAFE-NEXT: {{ $}} - ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] - ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) - ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 %4:_(s32) = COPY $vgpr0 %5:_(s32) = COPY $vgpr1 %0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32) @@ -846,6 +919,101 @@ body: | ... 
--- +name: test_double_add_mul_contract +body: | + bb.1.entry: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + + ; GFX9-LABEL: name: test_double_add_mul_contract + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) + ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; + ; GFX9-DENORM-LABEL: name: test_double_add_mul_contract + ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX9-DENORM-NEXT: {{ $}} + ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] + ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) + ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; + ; GFX10-LABEL: name: test_double_add_mul_contract + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] + ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) + ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; + ; GFX10-DENORM-LABEL: name: test_double_add_mul_contract + ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; 
GFX10-DENORM-NEXT: {{ $}} + ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] + ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) + ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + %4:_(s32) = COPY $vgpr0 + %5:_(s32) = COPY $vgpr1 + %0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32) + %6:_(s32) = COPY $vgpr2 + %7:_(s32) = COPY $vgpr3 + %1:_(s64) = G_MERGE_VALUES %6(s32), %7(s32) + %8:_(s32) = COPY $vgpr4 + %9:_(s32) = COPY $vgpr5 + %2:_(s64) = G_MERGE_VALUES %8(s32), %9(s32) + %10:_(s64) = contract G_FMUL %0, %1 + %11:_(s64) = contract G_FADD %10, %2 + %13:_(s32), %14:_(s32) = G_UNMERGE_VALUES %11(s64) + $vgpr0 = COPY %13(s32) + $vgpr1 = COPY %14(s32) + S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 +... + + +--- name: test_double_add_mul_rhs body: | bb.1.entry: @@ -869,23 +1037,7 @@ body: | ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; GFX9-CONTRACT-LABEL: name: test_double_add_mul_rhs - ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX9-CONTRACT-NEXT: {{ $}} - ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] - ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) - ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; ; GFX9-DENORM-LABEL: name: test_double_add_mul_rhs ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GFX9-DENORM-NEXT: {{ $}} @@ -904,23 +1056,7 @@ body: | ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; GFX9-UNSAFE-LABEL: name: test_double_add_mul_rhs - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX9-UNSAFE-NEXT: 
{{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] - ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; ; GFX10-LABEL: name: test_double_add_mul_rhs ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GFX10-NEXT: {{ $}} @@ -939,23 +1075,7 @@ body: | ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; GFX10-CONTRACT-LABEL: name: test_double_add_mul_rhs - ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX10-CONTRACT-NEXT: {{ $}} - ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] - ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) - ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; ; GFX10-DENORM-LABEL: name: test_double_add_mul_rhs ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GFX10-DENORM-NEXT: {{ $}} @@ -974,23 +1094,6 @@ body: | ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; GFX10-UNSAFE-LABEL: name: test_double_add_mul_rhs - ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX10-UNSAFE-NEXT: {{ $}} - ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX10-UNSAFE-NEXT: 
[[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] - ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) - ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 %4:_(s32) = COPY $vgpr0 %5:_(s32) = COPY $vgpr1 %0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32) @@ -1009,6 +1112,100 @@ body: | ... --- +name: test_double_add_mul_rhs_contract +body: | + bb.1.entry: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + + ; GFX9-LABEL: name: test_double_add_mul_rhs_contract + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) + ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; + ; GFX9-DENORM-LABEL: name: test_double_add_mul_rhs_contract + ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX9-DENORM-NEXT: {{ $}} + ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] + ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) + ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; + ; GFX10-LABEL: name: test_double_add_mul_rhs_contract + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = 
G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] + ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) + ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; + ; GFX10-DENORM-LABEL: name: test_double_add_mul_rhs_contract + ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX10-DENORM-NEXT: {{ $}} + ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] + ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) + ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + %4:_(s32) = COPY $vgpr0 + %5:_(s32) = COPY $vgpr1 + %0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32) + %6:_(s32) = COPY $vgpr2 + %7:_(s32) = COPY $vgpr3 + %1:_(s64) = G_MERGE_VALUES %6(s32), %7(s32) + %8:_(s32) = COPY $vgpr4 + %9:_(s32) = COPY $vgpr5 + %2:_(s64) = G_MERGE_VALUES %8(s32), %9(s32) + %10:_(s64) = contract G_FMUL %0, %1 + %11:_(s64) = contract G_FADD %2, %10 + %13:_(s32), %14:_(s32) = G_UNMERGE_VALUES %11(s64) + $vgpr0 = COPY %13(s32) + $vgpr1 = COPY %14(s32) + S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 +... 
+ +--- name: test_4xfloat_add_mul body: | bb.1.entry: @@ -1040,32 +1237,7 @@ body: | ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX9-CONTRACT-LABEL: name: test_4xfloat_add_mul - ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 - ; GFX9-CONTRACT-NEXT: {{ $}} - ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX9-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; GFX9-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GFX9-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] - ; GFX9-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]] - ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>) - ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX9-DENORM-LABEL: name: test_4xfloat_add_mul ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 ; GFX9-DENORM-NEXT: {{ $}} @@ -1092,32 +1264,7 @@ body: | ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX9-UNSAFE-LABEL: name: test_4xfloat_add_mul - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; 
GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX9-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; GFX9-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GFX9-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] - ; GFX9-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]] - ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX10-LABEL: name: test_4xfloat_add_mul ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 ; GFX10-NEXT: {{ $}} @@ -1144,32 +1291,7 @@ body: | ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-CONTRACT-LABEL: name: test_4xfloat_add_mul - ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 - ; GFX10-CONTRACT-NEXT: {{ $}} - ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX10-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; GFX10-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GFX10-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] - ; GFX10-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]] - ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>) - ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY 
[[UV2]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; ; GFX10-DENORM-LABEL: name: test_4xfloat_add_mul ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 ; GFX10-DENORM-NEXT: {{ $}} @@ -1196,32 +1318,6 @@ body: | ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; GFX10-UNSAFE-LABEL: name: test_4xfloat_add_mul - ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 - ; GFX10-UNSAFE-NEXT: {{ $}} - ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX10-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; GFX10-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GFX10-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] - ; GFX10-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]] - ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>) - ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 %4:_(s32) = COPY $vgpr0 %5:_(s32) = COPY $vgpr1 %6:_(s32) = COPY $vgpr2 @@ -1248,6 +1344,144 @@ body: | ... 
--- +name: test_4xfloat_add_mul_contract +body: | + bb.1.entry: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 + + ; GFX9-LABEL: name: test_4xfloat_add_mul_contract + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]] + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>) + ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; + ; GFX9-DENORM-LABEL: name: test_4xfloat_add_mul_contract + ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 + ; GFX9-DENORM-NEXT: {{ $}} + ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX9-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GFX9-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX9-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = contract 
G_FADD [[FMUL]], [[BUILD_VECTOR2]] + ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>) + ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; + ; GFX10-LABEL: name: test_4xfloat_add_mul_contract + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]] + ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>) + ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; + ; GFX10-DENORM-LABEL: name: test_4xfloat_add_mul_contract + ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10 + ; GFX10-DENORM-NEXT: {{ $}} + ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX10-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GFX10-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) 
= COPY $vgpr10 + ; GFX10-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s32>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]] + ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s32>) + ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + %4:_(s32) = COPY $vgpr0 + %5:_(s32) = COPY $vgpr1 + %6:_(s32) = COPY $vgpr2 + %7:_(s32) = COPY $vgpr3 + %0:_(<4 x s32>) = G_BUILD_VECTOR %4(s32), %5(s32), %6(s32), %7(s32) + %8:_(s32) = COPY $vgpr4 + %9:_(s32) = COPY $vgpr5 + %10:_(s32) = COPY $vgpr6 + %11:_(s32) = COPY $vgpr7 + %1:_(<4 x s32>) = G_BUILD_VECTOR %8(s32), %9(s32), %10(s32), %11(s32) + %12:_(s32) = COPY $vgpr8 + %13:_(s32) = COPY $vgpr9 + %14:_(s32) = COPY $vgpr10 + %15:_(s32) = COPY $vgpr11 + %2:_(<4 x s32>) = G_BUILD_VECTOR %12(s32), %13(s32), %14(s32), %15(s32) + %16:_(<4 x s32>) = contract G_FMUL %0, %1 + %17:_(<4 x s32>) = contract G_FADD %16, %2 + %19:_(s32), %20:_(s32), %21:_(s32), %22:_(s32) = G_UNMERGE_VALUES %17(<4 x s32>) + $vgpr0 = COPY %19(s32) + $vgpr1 = COPY %20(s32) + $vgpr2 = COPY %21(s32) + $vgpr3 = COPY %22(s32) + S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +... 
+ +--- name: test_3xfloat_add_mul_rhs body: | bb.1.entry: @@ -1275,28 +1509,7 @@ body: | ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 - ; GFX9-CONTRACT-LABEL: name: test_3xfloat_add_mul_rhs - ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 - ; GFX9-CONTRACT-NEXT: {{ $}} - ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) - ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32) - ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32) - ; GFX9-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] - ; GFX9-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]] - ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>) - ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 + ; ; GFX9-DENORM-LABEL: name: test_3xfloat_add_mul_rhs ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX9-DENORM-NEXT: {{ $}} @@ -1319,28 +1532,7 @@ body: | ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 - ; GFX9-UNSAFE-LABEL: name: test_3xfloat_add_mul_rhs - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) - ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32) - ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32) - ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = G_FMUL 
[[BUILD_VECTOR]], [[BUILD_VECTOR1]] - ; GFX9-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]] - ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 + ; ; GFX10-LABEL: name: test_3xfloat_add_mul_rhs ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX10-NEXT: {{ $}} @@ -1363,28 +1555,7 @@ body: | ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 - ; GFX10-CONTRACT-LABEL: name: test_3xfloat_add_mul_rhs - ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 - ; GFX10-CONTRACT-NEXT: {{ $}} - ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) - ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32) - ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32) - ; GFX10-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] - ; GFX10-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]] - ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>) - ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 + ; ; GFX10-DENORM-LABEL: name: test_3xfloat_add_mul_rhs ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX10-DENORM-NEXT: {{ $}} @@ -1407,28 +1578,6 @@ body: | ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 - ; GFX10-UNSAFE-LABEL: name: test_3xfloat_add_mul_rhs - ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 - ; GFX10-UNSAFE-NEXT: {{ $}} - ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) - ; GFX10-UNSAFE-NEXT: 
[[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32) - ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32) - ; GFX10-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] - ; GFX10-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]] - ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>) - ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 %4:_(s32) = COPY $vgpr0 %5:_(s32) = COPY $vgpr1 %6:_(s32) = COPY $vgpr2 @@ -1451,6 +1600,124 @@ body: | ... --- +name: test_3xfloat_add_mul_rhs_contract +body: | + bb.1.entry: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + + ; GFX9-LABEL: name: test_3xfloat_add_mul_rhs_contract + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32) + ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32) + ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]] + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>) + ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 + ; + ; GFX9-DENORM-LABEL: name: test_3xfloat_add_mul_rhs_contract + ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX9-DENORM-NEXT: {{ $}} + ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; 
GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32) + ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32) + ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]] + ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>) + ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 + ; + ; GFX10-LABEL: name: test_3xfloat_add_mul_rhs_contract + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32) + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32) + ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]] + ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>) + ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 + ; + ; GFX10-DENORM-LABEL: name: test_3xfloat_add_mul_rhs_contract + ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX10-DENORM-NEXT: {{ $}} + ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32) + ; GFX10-DENORM-NEXT: 
[[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32) + ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]] + ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>) + ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 + %4:_(s32) = COPY $vgpr0 + %5:_(s32) = COPY $vgpr1 + %6:_(s32) = COPY $vgpr2 + %0:_(<3 x s32>) = G_BUILD_VECTOR %4(s32), %5(s32), %6(s32) + %7:_(s32) = COPY $vgpr3 + %8:_(s32) = COPY $vgpr4 + %9:_(s32) = COPY $vgpr5 + %1:_(<3 x s32>) = G_BUILD_VECTOR %7(s32), %8(s32), %9(s32) + %10:_(s32) = COPY $vgpr6 + %11:_(s32) = COPY $vgpr7 + %12:_(s32) = COPY $vgpr8 + %2:_(<3 x s32>) = G_BUILD_VECTOR %10(s32), %11(s32), %12(s32) + %13:_(<3 x s32>) = contract G_FMUL %0, %1 + %14:_(<3 x s32>) = contract G_FADD %2, %13 + %16:_(s32), %17:_(s32), %18:_(s32) = G_UNMERGE_VALUES %14(<3 x s32>) + $vgpr0 = COPY %16(s32) + $vgpr1 = COPY %17(s32) + $vgpr2 = COPY %18(s32) + S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 +... + +--- name: test_4xhalf_add_mul body: | bb.1.entry: @@ -1474,24 +1741,7 @@ body: | ; GFX9-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; GFX9-CONTRACT-LABEL: name: test_4xhalf_add_mul - ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX9-CONTRACT-NEXT: {{ $}} - ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>) - ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 - ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 - ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>) - ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 - ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 - ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>) - ; GFX9-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]] - ; GFX9-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = G_FADD [[FMUL]], [[CONCAT_VECTORS2]] - ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>) - ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) - ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) - ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; ; GFX9-DENORM-LABEL: name: test_4xhalf_add_mul ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GFX9-DENORM-NEXT: {{ $}} @@ -1510,24 +1760,7 @@ body: | ; 
GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; GFX9-UNSAFE-LABEL: name: test_4xhalf_add_mul - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 - ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 - ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 - ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 - ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]] - ; GFX9-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = G_FADD [[FMUL]], [[CONCAT_VECTORS2]] - ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; ; GFX10-LABEL: name: test_4xhalf_add_mul ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GFX10-NEXT: {{ $}} @@ -1546,24 +1779,7 @@ body: | ; GFX10-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; GFX10-CONTRACT-LABEL: name: test_4xhalf_add_mul - ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX10-CONTRACT-NEXT: {{ $}} - ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>) - ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 - ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 - ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>) - ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 - ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 - ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>) - ; GFX10-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]] - ; GFX10-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = G_FADD [[FMUL]], [[CONCAT_VECTORS2]] - ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>) - ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) - ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) - ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; ; GFX10-DENORM-LABEL: name: test_4xhalf_add_mul ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, 
$vgpr3, $vgpr4, $vgpr5 ; GFX10-DENORM-NEXT: {{ $}} @@ -1582,24 +1798,6 @@ body: | ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; GFX10-UNSAFE-LABEL: name: test_4xhalf_add_mul - ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX10-UNSAFE-NEXT: {{ $}} - ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>) - ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 - ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 - ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>) - ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 - ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 - ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>) - ; GFX10-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]] - ; GFX10-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = G_FADD [[FMUL]], [[CONCAT_VECTORS2]] - ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>) - ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) - ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) - ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 %4:_(<2 x s16>) = COPY $vgpr0 %5:_(<2 x s16>) = COPY $vgpr1 %0:_(<4 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>) @@ -1618,6 +1816,105 @@ body: | ... 
--- +name: test_4xhalf_add_mul_contract +body: | + bb.1.entry: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + + ; GFX9-LABEL: name: test_4xhalf_add_mul_contract + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>) + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX9-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>) + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 + ; GFX9-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>) + ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = contract G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = contract G_FADD [[FMUL]], [[CONCAT_VECTORS2]] + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>) + ; GFX9-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) + ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) + ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; + ; GFX9-DENORM-LABEL: name: test_4xhalf_add_mul_contract + ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX9-DENORM-NEXT: {{ $}} + ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>) + ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>) + ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 + ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 + ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>) + ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = contract G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]] + ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = contract G_FADD [[FMUL]], [[CONCAT_VECTORS2]] + ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>) + ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) + ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) + ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; + ; GFX10-LABEL: name: test_4xhalf_add_mul_contract + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>) + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX10-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>) + ; 
GFX10-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 + ; GFX10-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>) + ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = contract G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]] + ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = contract G_FADD [[FMUL]], [[CONCAT_VECTORS2]] + ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>) + ; GFX10-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) + ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) + ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; + ; GFX10-DENORM-LABEL: name: test_4xhalf_add_mul_contract + ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX10-DENORM-NEXT: {{ $}} + ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>) + ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>) + ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 + ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 + ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>) + ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s16>) = contract G_FMUL [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]] + ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s16>) = contract G_FADD [[FMUL]], [[CONCAT_VECTORS2]] + ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FADD]](<4 x s16>) + ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) + ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) + ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + %4:_(<2 x s16>) = COPY $vgpr0 + %5:_(<2 x s16>) = COPY $vgpr1 + %0:_(<4 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>) + %6:_(<2 x s16>) = COPY $vgpr2 + %7:_(<2 x s16>) = COPY $vgpr3 + %1:_(<4 x s16>) = G_CONCAT_VECTORS %6(<2 x s16>), %7(<2 x s16>) + %8:_(<2 x s16>) = COPY $vgpr4 + %9:_(<2 x s16>) = COPY $vgpr5 + %2:_(<4 x s16>) = G_CONCAT_VECTORS %8(<2 x s16>), %9(<2 x s16>) + %10:_(<4 x s16>) = contract G_FMUL %0, %1 + %11:_(<4 x s16>) = contract G_FADD %10, %2 + %13:_(<2 x s16>), %14:_(<2 x s16>) = G_UNMERGE_VALUES %11(<4 x s16>) + $vgpr0 = COPY %13(<2 x s16>) + $vgpr1 = COPY %14(<2 x s16>) + S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 +... 
+ + +--- name: test_3xhalf_add_mul_rhs body: | bb.1.entry: @@ -1648,31 +1945,6 @@ body: | ; GFX9-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>) ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 ; - ; GFX9-CONTRACT-LABEL: name: test_3xhalf_add_mul_rhs - ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX9-CONTRACT-NEXT: {{ $}} - ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX9-CONTRACT-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) - ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 - ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 - ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX9-CONTRACT-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>) - ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 - ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 - ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX9-CONTRACT-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>) - ; GFX9-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = G_FMUL [[UV]], [[UV2]] - ; GFX9-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = G_FADD [[UV4]], [[FMUL]] - ; GFX9-CONTRACT-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF - ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>) - ; GFX9-CONTRACT-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>) - ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>) - ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>) - ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; ; GFX9-DENORM-LABEL: name: test_3xhalf_add_mul_rhs ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GFX9-DENORM-NEXT: {{ $}} @@ -1698,31 +1970,6 @@ body: | ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>) ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 ; - ; GFX9-UNSAFE-LABEL: name: test_3xhalf_add_mul_rhs - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) - ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 - ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 - ; GFX9-UNSAFE-NEXT: 
[[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>) - ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 - ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 - ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>) - ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = G_FMUL [[UV]], [[UV2]] - ; GFX9-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = G_FADD [[UV4]], [[FMUL]] - ; GFX9-UNSAFE-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF - ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>) - ; GFX9-UNSAFE-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; ; GFX10-LABEL: name: test_3xhalf_add_mul_rhs ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GFX10-NEXT: {{ $}} @@ -1748,31 +1995,6 @@ body: | ; GFX10-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>) ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 ; - ; GFX10-CONTRACT-LABEL: name: test_3xhalf_add_mul_rhs - ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX10-CONTRACT-NEXT: {{ $}} - ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX10-CONTRACT-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) - ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 - ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 - ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX10-CONTRACT-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>) - ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 - ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 - ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX10-CONTRACT-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>) - ; GFX10-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = G_FMUL [[UV]], [[UV2]] - ; GFX10-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = G_FADD [[UV4]], [[FMUL]] - ; GFX10-CONTRACT-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF - ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>) - ; GFX10-CONTRACT-NEXT: 
[[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>) - ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>) - ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>) - ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; ; GFX10-DENORM-LABEL: name: test_3xhalf_add_mul_rhs ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GFX10-DENORM-NEXT: {{ $}} @@ -1797,31 +2019,6 @@ body: | ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>) ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>) ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; - ; GFX10-UNSAFE-LABEL: name: test_3xhalf_add_mul_rhs - ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX10-UNSAFE-NEXT: {{ $}} - ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX10-UNSAFE-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) - ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 - ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 - ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX10-UNSAFE-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>) - ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 - ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 - ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX10-UNSAFE-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>) - ; GFX10-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = G_FMUL [[UV]], [[UV2]] - ; GFX10-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = G_FADD [[UV4]], [[FMUL]] - ; GFX10-UNSAFE-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF - ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>) - ; GFX10-UNSAFE-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>) - ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>) - ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>) - ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 %4:_(<2 x s16>) = COPY $vgpr0 %5:_(<2 x s16>) = COPY $vgpr1 %10:_(<2 x s16>) = G_IMPLICIT_DEF @@ -1846,6 +2043,134 @@ body: | ... 
--- +name: test_3xhalf_add_mul_rhs_contract +body: | + bb.1.entry: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + + ; GFX9-LABEL: name: test_3xhalf_add_mul_rhs_contract + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX9-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>) + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 + ; GFX9-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>) + ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = contract G_FMUL [[UV]], [[UV2]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = contract G_FADD [[UV4]], [[FMUL]] + ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>) + ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>) + ; GFX9-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>) + ; GFX9-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>) + ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; + ; GFX9-DENORM-LABEL: name: test_3xhalf_add_mul_rhs_contract + ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX9-DENORM-NEXT: {{ $}} + ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX9-DENORM-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX9-DENORM-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>) + ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 + ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 + ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX9-DENORM-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) 
= G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>) + ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = contract G_FMUL [[UV]], [[UV2]] + ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = contract G_FADD [[UV4]], [[FMUL]] + ; GFX9-DENORM-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF + ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>) + ; GFX9-DENORM-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>) + ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>) + ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>) + ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; + ; GFX10-LABEL: name: test_3xhalf_add_mul_rhs_contract + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX10-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>) + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 + ; GFX10-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX10-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>) + ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = contract G_FMUL [[UV]], [[UV2]] + ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = contract G_FADD [[UV4]], [[FMUL]] + ; GFX10-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF + ; GFX10-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>) + ; GFX10-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>) + ; GFX10-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>) + ; GFX10-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>) + ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; + ; GFX10-DENORM-LABEL: name: test_3xhalf_add_mul_rhs_contract + ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX10-DENORM-NEXT: {{ $}} + ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX10-DENORM-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; 
GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX10-DENORM-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>) + ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 + ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 + ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX10-DENORM-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>) + ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s16>) = contract G_FMUL [[UV]], [[UV2]] + ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s16>) = contract G_FADD [[UV4]], [[FMUL]] + ; GFX10-DENORM-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF + ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FADD]](<3 x s16>), [[DEF1]](<3 x s16>) + ; GFX10-DENORM-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>) + ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>) + ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>) + ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + %4:_(<2 x s16>) = COPY $vgpr0 + %5:_(<2 x s16>) = COPY $vgpr1 + %10:_(<2 x s16>) = G_IMPLICIT_DEF + %11:_(<6 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>), %10(<2 x s16>) + %0:_(<3 x s16>), %12:_(<3 x s16>) = G_UNMERGE_VALUES %11(<6 x s16>) + %6:_(<2 x s16>) = COPY $vgpr2 + %7:_(<2 x s16>) = COPY $vgpr3 + %13:_(<6 x s16>) = G_CONCAT_VECTORS %6(<2 x s16>), %7(<2 x s16>), %10(<2 x s16>) + %1:_(<3 x s16>), %14:_(<3 x s16>) = G_UNMERGE_VALUES %13(<6 x s16>) + %8:_(<2 x s16>) = COPY $vgpr4 + %9:_(<2 x s16>) = COPY $vgpr5 + %15:_(<6 x s16>) = G_CONCAT_VECTORS %8(<2 x s16>), %9(<2 x s16>), %10(<2 x s16>) + %2:_(<3 x s16>), %16:_(<3 x s16>) = G_UNMERGE_VALUES %15(<6 x s16>) + %17:_(<3 x s16>) = contract G_FMUL %0, %1 + %18:_(<3 x s16>) = contract G_FADD %2, %17 + %22:_(<3 x s16>) = G_IMPLICIT_DEF + %23:_(<6 x s16>) = G_CONCAT_VECTORS %18(<3 x s16>), %22(<3 x s16>) + %20:_(<2 x s16>), %21:_(<2 x s16>), %24:_(<2 x s16>) = G_UNMERGE_VALUES %23(<6 x s16>) + $vgpr0 = COPY %20(<2 x s16>) + $vgpr1 = COPY %21(<2 x s16>) + S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 +... 
+ +--- name: test_4xdouble_add_mul body: | bb.1.entry: @@ -1905,60 +2230,7 @@ body: | ; GFX9-NEXT: $vgpr6 = COPY [[UV6]](s32) ; GFX9-NEXT: $vgpr7 = COPY [[UV7]](s32) ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 - ; GFX9-CONTRACT-LABEL: name: test_4xdouble_add_mul - ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 - ; GFX9-CONTRACT-NEXT: {{ $}} - ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX9-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) - ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64) - ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX9-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; GFX9-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GFX9-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GFX9-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 - ; GFX9-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 - ; GFX9-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 - ; GFX9-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 - ; GFX9-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) - ; GFX9-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) - ; GFX9-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) - ; GFX9-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) - ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64) - ; GFX9-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 - ; GFX9-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 - ; GFX9-CONTRACT-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18 - ; GFX9-CONTRACT-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19 - ; GFX9-CONTRACT-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20 - ; GFX9-CONTRACT-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21 - ; GFX9-CONTRACT-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22 - ; GFX9-CONTRACT-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23 - ; GFX9-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) - ; GFX9-CONTRACT-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32) - ; GFX9-CONTRACT-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32) - ; GFX9-CONTRACT-NEXT: 
[[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32) - ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64) - ; GFX9-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] - ; GFX9-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]] - ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>) - ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr6 = COPY [[UV6]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr7 = COPY [[UV7]](s32) - ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; ; GFX9-DENORM-LABEL: name: test_4xdouble_add_mul ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 ; GFX9-DENORM-NEXT: {{ $}} @@ -2013,60 +2285,7 @@ body: | ; GFX9-DENORM-NEXT: $vgpr6 = COPY [[UV6]](s32) ; GFX9-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32) ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 - ; GFX9-UNSAFE-LABEL: name: test_4xdouble_add_mul - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX9-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64) - ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX9-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; GFX9-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GFX9-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GFX9-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 - ; GFX9-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 - 
; GFX9-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 - ; GFX9-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 - ; GFX9-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) - ; GFX9-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) - ; GFX9-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) - ; GFX9-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64) - ; GFX9-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 - ; GFX9-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 - ; GFX9-UNSAFE-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18 - ; GFX9-UNSAFE-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19 - ; GFX9-UNSAFE-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20 - ; GFX9-UNSAFE-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21 - ; GFX9-UNSAFE-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22 - ; GFX9-UNSAFE-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23 - ; GFX9-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) - ; GFX9-UNSAFE-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32) - ; GFX9-UNSAFE-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32) - ; GFX9-UNSAFE-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32) - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64) - ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] - ; GFX9-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]] - ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr6 = COPY [[UV6]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr7 = COPY [[UV7]](s32) - ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; ; GFX10-LABEL: name: test_4xdouble_add_mul ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 ; GFX10-NEXT: {{ $}} @@ -2121,60 +2340,7 @@ body: | ; GFX10-NEXT: $vgpr6 = COPY [[UV6]](s32) ; GFX10-NEXT: $vgpr7 = COPY [[UV7]](s32) ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 - ; GFX10-CONTRACT-LABEL: name: test_4xdouble_add_mul - ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 - 
; GFX10-CONTRACT-NEXT: {{ $}} - ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX10-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) - ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64) - ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX10-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; GFX10-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GFX10-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GFX10-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 - ; GFX10-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 - ; GFX10-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 - ; GFX10-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 - ; GFX10-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) - ; GFX10-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) - ; GFX10-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) - ; GFX10-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) - ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64) - ; GFX10-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 - ; GFX10-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 - ; GFX10-CONTRACT-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18 - ; GFX10-CONTRACT-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19 - ; GFX10-CONTRACT-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20 - ; GFX10-CONTRACT-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21 - ; GFX10-CONTRACT-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22 - ; GFX10-CONTRACT-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23 - ; GFX10-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) - ; GFX10-CONTRACT-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32) - ; GFX10-CONTRACT-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32) - ; GFX10-CONTRACT-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32) - ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64) - ; GFX10-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] - ; GFX10-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]] - ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), 
[[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>) - ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr6 = COPY [[UV6]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr7 = COPY [[UV7]](s32) - ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; ; GFX10-DENORM-LABEL: name: test_4xdouble_add_mul ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 ; GFX10-DENORM-NEXT: {{ $}} @@ -2229,60 +2395,6 @@ body: | ; GFX10-DENORM-NEXT: $vgpr6 = COPY [[UV6]](s32) ; GFX10-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32) ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 - ; GFX10-UNSAFE-LABEL: name: test_4xdouble_add_mul - ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 - ; GFX10-UNSAFE-NEXT: {{ $}} - ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX10-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) - ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64) - ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX10-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; GFX10-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GFX10-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GFX10-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 - ; GFX10-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 - ; GFX10-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 - ; GFX10-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 - ; GFX10-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) - ; GFX10-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) - ; GFX10-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) - ; GFX10-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) - ; GFX10-UNSAFE-NEXT: 
[[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64) - ; GFX10-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 - ; GFX10-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 - ; GFX10-UNSAFE-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18 - ; GFX10-UNSAFE-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19 - ; GFX10-UNSAFE-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20 - ; GFX10-UNSAFE-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21 - ; GFX10-UNSAFE-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22 - ; GFX10-UNSAFE-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23 - ; GFX10-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) - ; GFX10-UNSAFE-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32) - ; GFX10-UNSAFE-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32) - ; GFX10-UNSAFE-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32) - ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64) - ; GFX10-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] - ; GFX10-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = G_FADD [[FMUL]], [[BUILD_VECTOR2]] - ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>) - ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr6 = COPY [[UV6]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr7 = COPY [[UV7]](s32) - ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %4:_(s32) = COPY $vgpr0 %5:_(s32) = COPY $vgpr1 %6:_(s32) = COPY $vgpr2 @@ -2337,6 +2449,284 @@ body: | ... 
--- +name: test_4xdouble_add_mul_contract +body: | + bb.1.entry: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX9-LABEL: name: test_4xdouble_add_mul_contract + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64) + ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 + ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 + ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) + ; GFX9-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) + ; GFX9-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) + ; GFX9-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18 + ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19 + ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20 + ; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21 + ; GFX9-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22 + ; GFX9-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23 + ; GFX9-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) + ; GFX9-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32) + ; GFX9-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32) + ; GFX9-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64) + ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]] + ; GFX9-NEXT: 
[[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>) + ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr4 = COPY [[UV4]](s32) + ; GFX9-NEXT: $vgpr5 = COPY [[UV5]](s32) + ; GFX9-NEXT: $vgpr6 = COPY [[UV6]](s32) + ; GFX9-NEXT: $vgpr7 = COPY [[UV7]](s32) + ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; + ; GFX9-DENORM-LABEL: name: test_4xdouble_add_mul_contract + ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX9-DENORM-NEXT: {{ $}} + ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX9-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) + ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64) + ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX9-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GFX9-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX9-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX9-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 + ; GFX9-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 + ; GFX9-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 + ; GFX9-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 + ; GFX9-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) + ; GFX9-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) + ; GFX9-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) + ; GFX9-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) + ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64) + ; GFX9-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 + ; GFX9-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 + ; GFX9-DENORM-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18 + ; GFX9-DENORM-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19 + ; GFX9-DENORM-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20 + ; GFX9-DENORM-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21 + ; GFX9-DENORM-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22 + ; GFX9-DENORM-NEXT: 
[[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23 + ; GFX9-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) + ; GFX9-DENORM-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32) + ; GFX9-DENORM-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32) + ; GFX9-DENORM-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32) + ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64) + ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]] + ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>) + ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32) + ; GFX9-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32) + ; GFX9-DENORM-NEXT: $vgpr6 = COPY [[UV6]](s32) + ; GFX9-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32) + ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; + ; GFX10-LABEL: name: test_4xdouble_add_mul_contract + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX10-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64) + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 + ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 + ; GFX10-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) + ; GFX10-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) + ; GFX10-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) + ; GFX10-NEXT: 
[[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18 + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19 + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20 + ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21 + ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22 + ; GFX10-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23 + ; GFX10-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) + ; GFX10-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32) + ; GFX10-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32) + ; GFX10-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64) + ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]] + ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>) + ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr4 = COPY [[UV4]](s32) + ; GFX10-NEXT: $vgpr5 = COPY [[UV5]](s32) + ; GFX10-NEXT: $vgpr6 = COPY [[UV6]](s32) + ; GFX10-NEXT: $vgpr7 = COPY [[UV7]](s32) + ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; + ; GFX10-DENORM-LABEL: name: test_4xdouble_add_mul_contract + ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX10-DENORM-NEXT: {{ $}} + ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX10-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) + ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64) + ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX10-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY 
$vgpr9 + ; GFX10-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX10-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX10-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 + ; GFX10-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 + ; GFX10-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 + ; GFX10-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 + ; GFX10-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) + ; GFX10-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) + ; GFX10-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) + ; GFX10-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) + ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64) + ; GFX10-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 + ; GFX10-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 + ; GFX10-DENORM-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18 + ; GFX10-DENORM-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19 + ; GFX10-DENORM-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20 + ; GFX10-DENORM-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21 + ; GFX10-DENORM-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22 + ; GFX10-DENORM-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23 + ; GFX10-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) + ; GFX10-DENORM-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32) + ; GFX10-DENORM-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32) + ; GFX10-DENORM-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32) + ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64) + ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<4 x s64>) = contract G_FADD [[FMUL]], [[BUILD_VECTOR2]] + ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<4 x s64>) + ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32) + ; GFX10-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32) + ; GFX10-DENORM-NEXT: $vgpr6 = COPY [[UV6]](s32) + ; GFX10-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32) + ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + %4:_(s32) = COPY $vgpr0 + %5:_(s32) = COPY $vgpr1 + %6:_(s32) = COPY $vgpr2 + %7:_(s32) = COPY $vgpr3 + %8:_(s32) = COPY $vgpr4 + %9:_(s32) = COPY $vgpr5 + %10:_(s32) = COPY $vgpr6 + %11:_(s32) = COPY $vgpr7 + %28:_(s64) = G_MERGE_VALUES %4(s32), %5(s32) + %29:_(s64) = G_MERGE_VALUES %6(s32), %7(s32) + %30:_(s64) = G_MERGE_VALUES %8(s32), %9(s32) + %31:_(s64) = G_MERGE_VALUES %10(s32), %11(s32) + %0:_(<4 x s64>) = G_BUILD_VECTOR %28(s64), %29(s64), %30(s64), %31(s64) + %12:_(s32) = COPY $vgpr8 + %13:_(s32) = COPY $vgpr9 + %14:_(s32) = COPY $vgpr10 + 
%15:_(s32) = COPY $vgpr11 + %16:_(s32) = COPY $vgpr12 + %17:_(s32) = COPY $vgpr13 + %18:_(s32) = COPY $vgpr14 + %19:_(s32) = COPY $vgpr15 + %32:_(s64) = G_MERGE_VALUES %12(s32), %13(s32) + %33:_(s64) = G_MERGE_VALUES %14(s32), %15(s32) + %34:_(s64) = G_MERGE_VALUES %16(s32), %17(s32) + %35:_(s64) = G_MERGE_VALUES %18(s32), %19(s32) + %1:_(<4 x s64>) = G_BUILD_VECTOR %32(s64), %33(s64), %34(s64), %35(s64) + %20:_(s32) = COPY $vgpr16 + %21:_(s32) = COPY $vgpr17 + %22:_(s32) = COPY $vgpr18 + %23:_(s32) = COPY $vgpr19 + %24:_(s32) = COPY $vgpr20 + %25:_(s32) = COPY $vgpr21 + %26:_(s32) = COPY $vgpr22 + %27:_(s32) = COPY $vgpr23 + %36:_(s64) = G_MERGE_VALUES %20(s32), %21(s32) + %37:_(s64) = G_MERGE_VALUES %22(s32), %23(s32) + %38:_(s64) = G_MERGE_VALUES %24(s32), %25(s32) + %39:_(s64) = G_MERGE_VALUES %26(s32), %27(s32) + %2:_(<4 x s64>) = G_BUILD_VECTOR %36(s64), %37(s64), %38(s64), %39(s64) + %40:_(<4 x s64>) = contract G_FMUL %0, %1 + %41:_(<4 x s64>) = contract G_FADD %40, %2 + %43:_(s32), %44:_(s32), %45:_(s32), %46:_(s32), %47:_(s32), %48:_(s32), %49:_(s32), %50:_(s32) = G_UNMERGE_VALUES %41(<4 x s64>) + $vgpr0 = COPY %43(s32) + $vgpr1 = COPY %44(s32) + $vgpr2 = COPY %45(s32) + $vgpr3 = COPY %46(s32) + $vgpr4 = COPY %47(s32) + $vgpr5 = COPY %48(s32) + $vgpr6 = COPY %49(s32) + $vgpr7 = COPY %50(s32) + S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 +... + +--- name: test_3xdouble_add_mul_rhs body: | bb.1.entry: @@ -2385,49 +2775,7 @@ body: | ; GFX9-NEXT: $vgpr4 = COPY [[UV4]](s32) ; GFX9-NEXT: $vgpr5 = COPY [[UV5]](s32) ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 - ; GFX9-CONTRACT-LABEL: name: test_3xdouble_add_mul_rhs - ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17 - ; GFX9-CONTRACT-NEXT: {{ $}} - ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) - ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX9-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; GFX9-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GFX9-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GFX9-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) - ; GFX9-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) - ; GFX9-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) - ; 
GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64) - ; GFX9-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 - ; GFX9-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 - ; GFX9-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 - ; GFX9-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 - ; GFX9-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 - ; GFX9-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 - ; GFX9-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) - ; GFX9-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) - ; GFX9-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) - ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64) - ; GFX9-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] - ; GFX9-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]] - ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>) - ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32) - ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 + ; ; GFX9-DENORM-LABEL: name: test_3xdouble_add_mul_rhs ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17 ; GFX9-DENORM-NEXT: {{ $}} @@ -2471,49 +2819,7 @@ body: | ; GFX9-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32) ; GFX9-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32) ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 - ; GFX9-UNSAFE-LABEL: name: test_3xdouble_add_mul_rhs - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) - ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX9-UNSAFE-NEXT: 
[[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; GFX9-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GFX9-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GFX9-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) - ; GFX9-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) - ; GFX9-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64) - ; GFX9-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 - ; GFX9-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 - ; GFX9-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 - ; GFX9-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 - ; GFX9-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 - ; GFX9-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 - ; GFX9-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) - ; GFX9-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) - ; GFX9-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64) - ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] - ; GFX9-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]] - ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32) - ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 + ; ; GFX10-LABEL: name: test_3xdouble_add_mul_rhs ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17 ; GFX10-NEXT: {{ $}} @@ -2557,49 +2863,7 @@ body: | ; GFX10-NEXT: $vgpr4 = COPY [[UV4]](s32) ; GFX10-NEXT: $vgpr5 = COPY [[UV5]](s32) ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 - ; GFX10-CONTRACT-LABEL: name: test_3xdouble_add_mul_rhs - ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17 - ; GFX10-CONTRACT-NEXT: {{ $}} - ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; 
GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) - ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX10-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; GFX10-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GFX10-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GFX10-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) - ; GFX10-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) - ; GFX10-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) - ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64) - ; GFX10-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 - ; GFX10-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 - ; GFX10-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 - ; GFX10-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 - ; GFX10-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 - ; GFX10-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 - ; GFX10-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) - ; GFX10-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) - ; GFX10-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) - ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64) - ; GFX10-CONTRACT-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] - ; GFX10-CONTRACT-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]] - ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>) - ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32) - ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 + ; ; GFX10-DENORM-LABEL: name: test_3xdouble_add_mul_rhs ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17 ; GFX10-DENORM-NEXT: {{ $}} @@ -2643,49 +2907,6 @@ body: | ; GFX10-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32) ; GFX10-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32) ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 - ; GFX10-UNSAFE-LABEL: name: test_3xdouble_add_mul_rhs - ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17 - ; GFX10-UNSAFE-NEXT: {{ $}} - ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = 
COPY $vgpr0 - ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) - ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX10-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; GFX10-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GFX10-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GFX10-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) - ; GFX10-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) - ; GFX10-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) - ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64) - ; GFX10-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 - ; GFX10-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 - ; GFX10-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 - ; GFX10-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 - ; GFX10-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 - ; GFX10-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 - ; GFX10-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) - ; GFX10-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) - ; GFX10-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) - ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64) - ; GFX10-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] - ; GFX10-UNSAFE-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = G_FADD [[BUILD_VECTOR2]], [[FMUL]] - ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>) - ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32) - ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 %4:_(s32) = COPY $vgpr0 %5:_(s32) = COPY $vgpr1 %6:_(s32) = COPY $vgpr2 @@ -2727,3 +2948,226 @@ body: | $vgpr5 = COPY %39(s32) S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 ... 
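When the combine does fire, the replacement is emitted at the G_FADD's insertion point. A minimal `MachineIRBuilder` sketch of that rewrite (the helper name and operand plumbing are assumptions for illustration, not the in-tree code):

.. code-block:: c++

  #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
  #include "llvm/CodeGen/TargetOpcodes.h"

  using namespace llvm;

  // Illustrative only: given FAdd = contract G_FADD c, mul and
  // mul = contract G_FMUL a, b, rewrite FAdd as dst = contract G_FMA a, b, c.
  // The now-dead G_FMUL is left behind for dead-code elimination.
  static void emitContractedFMA(MachineIRBuilder &Builder, MachineInstr &FAdd,
                                Register A, Register B, Register C) {
    Builder.setInstrAndDebugLoc(FAdd);
    Builder.buildInstr(TargetOpcode::G_FMA, {FAdd.getOperand(0).getReg()},
                       {A, B, C}, MachineInstr::FmContract);
    FAdd.eraseFromParent();
  }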
+ +--- +name: test_3xdouble_add_mul_rhs_contract +body: | + bb.1.entry: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17 + + ; GFX9-LABEL: name: test_3xdouble_add_mul_rhs_contract + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) + ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) + ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) + ; GFX9-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 + ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 + ; GFX9-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) + ; GFX9-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) + ; GFX9-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64) + ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]] + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>) + ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr4 = COPY [[UV4]](s32) + ; GFX9-NEXT: $vgpr5 = COPY [[UV5]](s32) + ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 + ; + ; GFX9-DENORM-LABEL: name: test_3xdouble_add_mul_rhs_contract + ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, 
$vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17 + ; GFX9-DENORM-NEXT: {{ $}} + ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) + ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX9-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GFX9-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX9-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX9-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) + ; GFX9-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) + ; GFX9-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) + ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64) + ; GFX9-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 + ; GFX9-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 + ; GFX9-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 + ; GFX9-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 + ; GFX9-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 + ; GFX9-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 + ; GFX9-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) + ; GFX9-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) + ; GFX9-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) + ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64) + ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]] + ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>) + ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32) + ; GFX9-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32) + ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 + ; + ; GFX10-LABEL: name: test_3xdouble_add_mul_rhs_contract + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: 
[[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX10-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) + ; GFX10-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) + ; GFX10-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64) + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 + ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 + ; GFX10-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) + ; GFX10-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) + ; GFX10-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64) + ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]] + ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>) + ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr4 = COPY [[UV4]](s32) + ; GFX10-NEXT: $vgpr5 = COPY [[UV5]](s32) + ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 + ; + ; GFX10-DENORM-LABEL: name: test_3xdouble_add_mul_rhs_contract + ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17 + ; GFX10-DENORM-NEXT: {{ $}} + ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY 
$vgpr5 + ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) + ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX10-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GFX10-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX10-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX10-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) + ; GFX10-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) + ; GFX10-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) + ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64) + ; GFX10-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 + ; GFX10-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 + ; GFX10-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 + ; GFX10-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 + ; GFX10-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 + ; GFX10-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 + ; GFX10-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) + ; GFX10-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) + ; GFX10-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) + ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64) + ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s64>) = contract G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s64>) = contract G_FADD [[BUILD_VECTOR2]], [[FMUL]] + ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s64>) + ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32) + ; GFX10-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32) + ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 + %4:_(s32) = COPY $vgpr0 + %5:_(s32) = COPY $vgpr1 + %6:_(s32) = COPY $vgpr2 + %7:_(s32) = COPY $vgpr3 + %8:_(s32) = COPY $vgpr4 + %9:_(s32) = COPY $vgpr5 + %22:_(s64) = G_MERGE_VALUES %4(s32), %5(s32) + %23:_(s64) = G_MERGE_VALUES %6(s32), %7(s32) + %24:_(s64) = G_MERGE_VALUES %8(s32), %9(s32) + %0:_(<3 x s64>) = G_BUILD_VECTOR %22(s64), %23(s64), %24(s64) + %10:_(s32) = COPY $vgpr6 + %11:_(s32) = COPY $vgpr7 + %12:_(s32) = COPY $vgpr8 + %13:_(s32) = COPY $vgpr9 + %14:_(s32) = COPY $vgpr10 + %15:_(s32) = COPY $vgpr11 + %25:_(s64) = G_MERGE_VALUES %10(s32), %11(s32) + %26:_(s64) = G_MERGE_VALUES %12(s32), %13(s32) + %27:_(s64) = G_MERGE_VALUES %14(s32), %15(s32) + %1:_(<3 x s64>) = G_BUILD_VECTOR 
%25(s64), %26(s64), %27(s64) + %16:_(s32) = COPY $vgpr12 + %17:_(s32) = COPY $vgpr13 + %18:_(s32) = COPY $vgpr14 + %19:_(s32) = COPY $vgpr15 + %20:_(s32) = COPY $vgpr16 + %21:_(s32) = COPY $vgpr17 + %28:_(s64) = G_MERGE_VALUES %16(s32), %17(s32) + %29:_(s64) = G_MERGE_VALUES %18(s32), %19(s32) + %30:_(s64) = G_MERGE_VALUES %20(s32), %21(s32) + %2:_(<3 x s64>) = G_BUILD_VECTOR %28(s64), %29(s64), %30(s64) + %31:_(<3 x s64>) = contract G_FMUL %0, %1 + %32:_(<3 x s64>) = contract G_FADD %2, %31 + %34:_(s32), %35:_(s32), %36:_(s32), %37:_(s32), %38:_(s32), %39:_(s32) = G_UNMERGE_VALUES %32(<3 x s64>) + $vgpr0 = COPY %34(s32) + $vgpr1 = COPY %35(s32) + $vgpr2 = COPY %36(s32) + $vgpr3 = COPY %37(s32) + $vgpr4 = COPY %38(s32) + $vgpr5 = COPY %39(s32) + S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir index 42e53be..8f9fc67 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul-pre-legalize.mir @@ -1,12 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-prelegalizer-combiner %s -o - | FileCheck -check-prefix=GFX9 %s -# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-prelegalizer-combiner -fp-contract=fast %s -o - | FileCheck -check-prefix=GFX9-CONTRACT %s # RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-prelegalizer-combiner --denormal-fp-math=preserve-sign %s -o - | FileCheck -check-prefix=GFX9-DENORM %s -# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -run-pass=amdgpu-prelegalizer-combiner -enable-unsafe-fp-math %s -o - | FileCheck -check-prefix=GFX9-UNSAFE %s # RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-prelegalizer-combiner %s -o - | FileCheck -check-prefix=GFX10 %s -# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-prelegalizer-combiner -fp-contract=fast %s -o - | FileCheck -check-prefix=GFX10-CONTRACT %s # RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-prelegalizer-combiner --denormal-fp-math=preserve-sign %s -o - | FileCheck -check-prefix=GFX10-DENORM %s -# RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-prelegalizer-combiner -enable-unsafe-fp-math %s -o - | FileCheck -check-prefix=GFX10-UNSAFE %s --- name: test_f32_add_mul @@ -25,16 +21,6 @@ body: | ; GFX9-NEXT: $vgpr0 = COPY [[FADD]](s32) ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 ; - ; GFX9-CONTRACT-LABEL: name: test_f32_add_mul - ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX9-CONTRACT-NEXT: {{ $}} - ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]] - ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32) - ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; ; GFX9-DENORM-LABEL: name: test_f32_add_mul ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-DENORM-NEXT: {{ $}} @@ -46,16 +32,6 @@ body: | ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32) ; GFX9-DENORM-NEXT: S_SETPC_B64_return 
$sgpr30_sgpr31, implicit $vgpr0 ; - ; GFX9-UNSAFE-LABEL: name: test_f32_add_mul - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]] - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32) - ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; ; GFX10-LABEL: name: test_f32_add_mul ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} @@ -67,16 +43,6 @@ body: | ; GFX10-NEXT: $vgpr0 = COPY [[FADD]](s32) ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 ; - ; GFX10-CONTRACT-LABEL: name: test_f32_add_mul - ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX10-CONTRACT-NEXT: {{ $}} - ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]] - ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32) - ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; ; GFX10-DENORM-LABEL: name: test_f32_add_mul ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX10-DENORM-NEXT: {{ $}} @@ -87,16 +53,6 @@ body: | ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = reassoc G_FADD [[FMUL]], [[COPY2]] ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32) ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; - ; GFX10-UNSAFE-LABEL: name: test_f32_add_mul - ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX10-UNSAFE-NEXT: {{ $}} - ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]] - ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32) - ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = COPY $vgpr2 @@ -107,6 +63,60 @@ body: | ... 
 ---
+name: test_f32_add_mul_contract
+body: |
+  bb.1.entry:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; GFX9-LABEL: name: test_f32_add_mul_contract
+    ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX9-NEXT: {{ $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+    ; GFX9-NEXT: $vgpr0 = COPY [[FMA]](s32)
+    ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ;
+    ; GFX9-DENORM-LABEL: name: test_f32_add_mul_contract
+    ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX9-DENORM-NEXT: {{ $}}
+    ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+    ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+    ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ;
+    ; GFX10-LABEL: name: test_f32_add_mul_contract
+    ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX10-NEXT: {{ $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+    ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
+    ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ;
+    ; GFX10-DENORM-LABEL: name: test_f32_add_mul_contract
+    ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX10-DENORM-NEXT: {{ $}}
+    ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+    ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+    ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %4:_(s32) = reassoc contract G_FMUL %0, %1
+    %5:_(s32) = reassoc contract G_FADD %4, %2
+    $vgpr0 = COPY %5(s32)
+    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+...
+ +--- name: test_f32_add_mul_rhs body: | bb.1.entry: @@ -123,16 +133,6 @@ body: | ; GFX9-NEXT: $vgpr0 = COPY [[FADD]](s32) ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 ; - ; GFX9-CONTRACT-LABEL: name: test_f32_add_mul_rhs - ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX9-CONTRACT-NEXT: {{ $}} - ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]] - ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32) - ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; ; GFX9-DENORM-LABEL: name: test_f32_add_mul_rhs ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-DENORM-NEXT: {{ $}} @@ -144,16 +144,6 @@ body: | ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32) ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 ; - ; GFX9-UNSAFE-LABEL: name: test_f32_add_mul_rhs - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]] - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32) - ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; ; GFX10-LABEL: name: test_f32_add_mul_rhs ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} @@ -165,16 +155,6 @@ body: | ; GFX10-NEXT: $vgpr0 = COPY [[FADD]](s32) ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 ; - ; GFX10-CONTRACT-LABEL: name: test_f32_add_mul_rhs - ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX10-CONTRACT-NEXT: {{ $}} - ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]] - ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[FMA]](s32) - ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; ; GFX10-DENORM-LABEL: name: test_f32_add_mul_rhs ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX10-DENORM-NEXT: {{ $}} @@ -185,16 +165,6 @@ body: | ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s32) = reassoc G_FADD [[COPY2]], [[FMUL]] ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FADD]](s32) ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; - ; GFX10-UNSAFE-LABEL: name: test_f32_add_mul_rhs - ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX10-UNSAFE-NEXT: {{ $}} - ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]] - ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[FMA]](s32) - ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = COPY $vgpr2 @@ -205,6 +175,60 @@ body: | ... 
 ---
+name: test_f32_add_mul_rhs_contract
+body: |
+  bb.1.entry:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; GFX9-LABEL: name: test_f32_add_mul_rhs_contract
+    ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX9-NEXT: {{ $}}
+    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+    ; GFX9-NEXT: $vgpr0 = COPY [[FMA]](s32)
+    ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ;
+    ; GFX9-DENORM-LABEL: name: test_f32_add_mul_rhs_contract
+    ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX9-DENORM-NEXT: {{ $}}
+    ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+    ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+    ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ;
+    ; GFX10-LABEL: name: test_f32_add_mul_rhs_contract
+    ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX10-NEXT: {{ $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+    ; GFX10-NEXT: $vgpr0 = COPY [[FMA]](s32)
+    ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    ;
+    ; GFX10-DENORM-LABEL: name: test_f32_add_mul_rhs_contract
+    ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX10-DENORM-NEXT: {{ $}}
+    ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[COPY]], [[COPY1]], [[COPY2]]
+    ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[FMA]](s32)
+    ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %4:_(s32) = reassoc contract G_FMUL %0, %1
+    %5:_(s32) = reassoc contract G_FADD %2, %4
+    $vgpr0 = COPY %5(s32)
+    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
+...
+ +--- name: test_half_add_mul body: | bb.1.entry: @@ -225,20 +249,6 @@ body: | ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 ; - ; GFX9-CONTRACT-LABEL: name: test_half_add_mul - ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX9-CONTRACT-NEXT: {{ $}} - ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) - ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]] - ; GFX9-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16) - ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) - ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; ; GFX9-DENORM-LABEL: name: test_half_add_mul ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-DENORM-NEXT: {{ $}} @@ -254,20 +264,6 @@ body: | ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 ; - ; GFX9-UNSAFE-LABEL: name: test_half_add_mul - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) - ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]] - ; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) - ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; ; GFX10-LABEL: name: test_half_add_mul ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} @@ -283,20 +279,6 @@ body: | ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 ; - ; GFX10-CONTRACT-LABEL: name: test_half_add_mul - ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX10-CONTRACT-NEXT: {{ $}} - ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) - ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]] - ; GFX10-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16) - ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) - ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; ; GFX10-DENORM-LABEL: name: test_half_add_mul ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX10-DENORM-NEXT: {{ $}} @@ -311,20 +293,6 @@ body: | ; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16) ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; - 
; GFX10-UNSAFE-LABEL: name: test_half_add_mul - ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX10-UNSAFE-NEXT: {{ $}} - ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) - ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]] - ; GFX10-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16) - ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) - ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 %4:_(s32) = COPY $vgpr0 %0:_(s16) = G_TRUNC %4(s32) %5:_(s32) = COPY $vgpr1 @@ -339,6 +307,81 @@ body: | ... --- +name: test_half_add_mul_contract +body: | + bb.1.entry: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX9-LABEL: name: test_half_add_mul_contract + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]] + ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; + ; GFX9-DENORM-LABEL: name: test_half_add_mul_contract + ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9-DENORM-NEXT: {{ $}} + ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]] + ; GFX9-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16) + ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; + ; GFX10-LABEL: name: test_half_add_mul_contract + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]] + ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16) + ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; + ; GFX10-DENORM-LABEL: name: test_half_add_mul_contract + ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10-DENORM-NEXT: {{ $}} + ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-DENORM-NEXT: 
[[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]] + ; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16) + ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + %4:_(s32) = COPY $vgpr0 + %0:_(s16) = G_TRUNC %4(s32) + %5:_(s32) = COPY $vgpr1 + %1:_(s16) = G_TRUNC %5(s32) + %6:_(s32) = COPY $vgpr2 + %2:_(s16) = G_TRUNC %6(s32) + %7:_(s16) = reassoc contract G_FMUL %0, %1 + %8:_(s16) = reassoc contract G_FADD %7, %2 + %10:_(s32) = G_ANYEXT %8(s16) + $vgpr0 = COPY %10(s32) + S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 +... + + +--- name: test_half_add_mul_rhs body: | bb.1.entry: @@ -359,20 +402,6 @@ body: | ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 ; - ; GFX9-CONTRACT-LABEL: name: test_half_add_mul_rhs - ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX9-CONTRACT-NEXT: {{ $}} - ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) - ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]] - ; GFX9-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16) - ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) - ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; ; GFX9-DENORM-LABEL: name: test_half_add_mul_rhs ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX9-DENORM-NEXT: {{ $}} @@ -388,20 +417,6 @@ body: | ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 ; - ; GFX9-UNSAFE-LABEL: name: test_half_add_mul_rhs - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) - ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]] - ; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) - ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; ; GFX10-LABEL: name: test_half_add_mul_rhs ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} @@ -417,20 +432,6 @@ body: | ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 ; - ; GFX10-CONTRACT-LABEL: name: test_half_add_mul_rhs - ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX10-CONTRACT-NEXT: {{ $}} - ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY 
$vgpr0 - ; GFX10-CONTRACT-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-CONTRACT-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-CONTRACT-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) - ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]] - ; GFX10-CONTRACT-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16) - ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) - ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 - ; ; GFX10-DENORM-LABEL: name: test_half_add_mul_rhs ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX10-DENORM-NEXT: {{ $}} @@ -445,20 +446,84 @@ body: | ; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16) ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + %4:_(s32) = COPY $vgpr0 + %0:_(s16) = G_TRUNC %4(s32) + %5:_(s32) = COPY $vgpr1 + %1:_(s16) = G_TRUNC %5(s32) + %6:_(s32) = COPY $vgpr2 + %2:_(s16) = G_TRUNC %6(s32) + %7:_(s16) = reassoc G_FMUL %0, %1 + %8:_(s16) = reassoc G_FADD %2, %7 + %10:_(s32) = G_ANYEXT %8(s16) + $vgpr0 = COPY %10(s32) + S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 +... + +--- +name: test_half_add_mul_rhs_contract +body: | + bb.1.entry: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX9-LABEL: name: test_half_add_mul_rhs_contract + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s16) = reassoc G_FMUL [[TRUNC]], [[TRUNC1]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s16) = reassoc G_FADD [[TRUNC2]], [[FMUL]] + ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; + ; GFX9-DENORM-LABEL: name: test_half_add_mul_rhs_contract + ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9-DENORM-NEXT: {{ $}} + ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(s16) = reassoc G_FMUL [[TRUNC]], [[TRUNC1]] + ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s16) = reassoc G_FADD [[TRUNC2]], [[FMUL]] + ; GFX9-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16) + ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 ; - ; GFX10-UNSAFE-LABEL: name: test_half_add_mul_rhs - ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX10-UNSAFE-NEXT: {{ $}} - ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-UNSAFE-NEXT: 
[[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) - ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s16) = G_FMA [[TRUNC]], [[TRUNC1]], [[TRUNC2]] - ; GFX10-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMA]](s16) - ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) - ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; GFX10-LABEL: name: test_half_add_mul_rhs_contract + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s16) = reassoc G_FMUL [[TRUNC]], [[TRUNC1]] + ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s16) = reassoc G_FADD [[TRUNC2]], [[FMUL]] + ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16) + ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 + ; + ; GFX10-DENORM-LABEL: name: test_half_add_mul_rhs_contract + ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10-DENORM-NEXT: {{ $}} + ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-DENORM-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-DENORM-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-DENORM-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s32) + ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(s16) = reassoc G_FMUL [[TRUNC]], [[TRUNC1]] + ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(s16) = reassoc G_FADD [[TRUNC2]], [[FMUL]] + ; GFX10-DENORM-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FADD]](s16) + ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 %4:_(s32) = COPY $vgpr0 %0:_(s16) = G_TRUNC %4(s32) %5:_(s32) = COPY $vgpr1 @@ -497,24 +562,6 @@ body: | ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 ; - ; GFX9-CONTRACT-LABEL: name: test_double_add_mul - ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX9-CONTRACT-NEXT: {{ $}} - ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] - ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) - ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; 
GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; ; GFX9-DENORM-LABEL: name: test_double_add_mul ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GFX9-DENORM-NEXT: {{ $}} @@ -534,24 +581,6 @@ body: | ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 ; - ; GFX9-UNSAFE-LABEL: name: test_double_add_mul - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] - ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; ; GFX10-LABEL: name: test_double_add_mul ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GFX10-NEXT: {{ $}} @@ -571,24 +600,6 @@ body: | ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 ; - ; GFX10-CONTRACT-LABEL: name: test_double_add_mul - ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX10-CONTRACT-NEXT: {{ $}} - ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] - ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) - ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; ; GFX10-DENORM-LABEL: name: test_double_add_mul ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GFX10-DENORM-NEXT: {{ $}} @@ -607,24 +618,6 @@ body: | ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; - ; GFX10-UNSAFE-LABEL: name: test_double_add_mul - ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; 
GFX10-UNSAFE-NEXT: {{ $}} - ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] - ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) - ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 %4:_(s32) = COPY $vgpr0 %5:_(s32) = COPY $vgpr1 %0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32) @@ -643,6 +636,100 @@ body: | ... --- +name: test_double_add_mul_contract +body: | + bb.1.entry: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + + ; GFX9-LABEL: name: test_double_add_mul_contract + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) + ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; + ; GFX9-DENORM-LABEL: name: test_double_add_mul_contract + ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX9-DENORM-NEXT: {{ $}} + ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] + ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) + ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; + ; GFX10-LABEL: name: 
test_double_add_mul_contract
+    ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+    ; GFX10-NEXT: {{ $}}
+    ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+    ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+    ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+    ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+    ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+    ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+    ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+    ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32)
+    ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32)
+    ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    ;
+    ; GFX10-DENORM-LABEL: name: test_double_add_mul_contract
+    ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+    ; GFX10-DENORM-NEXT: {{ $}}
+    ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+    ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+    ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+    ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+    ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+    ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+    ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]]
+    ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64)
+    ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32)
+    ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32)
+    ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+    %4:_(s32) = COPY $vgpr0
+    %5:_(s32) = COPY $vgpr1
+    %0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+    %6:_(s32) = COPY $vgpr2
+    %7:_(s32) = COPY $vgpr3
+    %1:_(s64) = G_MERGE_VALUES %6(s32), %7(s32)
+    %8:_(s32) = COPY $vgpr4
+    %9:_(s32) = COPY $vgpr5
+    %2:_(s64) = G_MERGE_VALUES %8(s32), %9(s32)
+    %10:_(s64) = reassoc contract G_FMUL %0, %1
+    %11:_(s64) = reassoc contract G_FADD %10, %2
+    %13:_(s32), %14:_(s32) = G_UNMERGE_VALUES %11(s64)
+    $vgpr0 = COPY %13(s32)
+    $vgpr1 = COPY %14(s32)
+    S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1
+...
+ +--- name: test_double_add_mul_rhs body: | bb.1.entry: @@ -667,24 +754,6 @@ body: | ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 ; - ; GFX9-CONTRACT-LABEL: name: test_double_add_mul_rhs - ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX9-CONTRACT-NEXT: {{ $}} - ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] - ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) - ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; ; GFX9-DENORM-LABEL: name: test_double_add_mul_rhs ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GFX9-DENORM-NEXT: {{ $}} @@ -704,24 +773,6 @@ body: | ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 ; - ; GFX9-UNSAFE-LABEL: name: test_double_add_mul_rhs - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] - ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; ; GFX10-LABEL: name: test_double_add_mul_rhs ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GFX10-NEXT: {{ $}} @@ -741,24 +792,6 @@ body: | ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 ; - ; GFX10-CONTRACT-LABEL: name: test_double_add_mul_rhs - ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX10-CONTRACT-NEXT: {{ $}} - ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), 
[[COPY1]](s32) - ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] - ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) - ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; ; GFX10-DENORM-LABEL: name: test_double_add_mul_rhs ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GFX10-DENORM-NEXT: {{ $}} @@ -777,24 +810,6 @@ body: | ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; - ; GFX10-UNSAFE-LABEL: name: test_double_add_mul_rhs - ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX10-UNSAFE-NEXT: {{ $}} - ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] - ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) - ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 %4:_(s32) = COPY $vgpr0 %5:_(s32) = COPY $vgpr1 %0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32) @@ -813,6 +828,100 @@ body: | ... 
--- +name: test_double_add_mul_rhs_contract +body: | + bb.1.entry: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + + ; GFX9-LABEL: name: test_double_add_mul_rhs_contract + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) + ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; + ; GFX9-DENORM-LABEL: name: test_double_add_mul_rhs_contract + ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX9-DENORM-NEXT: {{ $}} + ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] + ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) + ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; + ; GFX10-LABEL: name: test_double_add_mul_rhs_contract + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] + ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) + ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; + ; GFX10-DENORM-LABEL: name: test_double_add_mul_rhs_contract + ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, 
$vgpr4, $vgpr5 + ; GFX10-DENORM-NEXT: {{ $}} + ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[MV]], [[MV1]], [[MV2]] + ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](s64) + ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + %4:_(s32) = COPY $vgpr0 + %5:_(s32) = COPY $vgpr1 + %0:_(s64) = G_MERGE_VALUES %4(s32), %5(s32) + %6:_(s32) = COPY $vgpr2 + %7:_(s32) = COPY $vgpr3 + %1:_(s64) = G_MERGE_VALUES %6(s32), %7(s32) + %8:_(s32) = COPY $vgpr4 + %9:_(s32) = COPY $vgpr5 + %2:_(s64) = G_MERGE_VALUES %8(s32), %9(s32) + %10:_(s64) = reassoc contract G_FMUL %0, %1 + %11:_(s64) = reassoc contract G_FADD %2, %10 + %13:_(s32), %14:_(s32) = G_UNMERGE_VALUES %11(s64) + $vgpr0 = COPY %13(s32) + $vgpr1 = COPY %14(s32) + S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 +... + +--- name: test_4xfloat_add_mul body: | bb.1.entry: @@ -845,32 +954,6 @@ body: | ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; - ; GFX9-CONTRACT-LABEL: name: test_4xfloat_add_mul - ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11 - ; GFX9-CONTRACT-NEXT: {{ $}} - ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX9-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; GFX9-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GFX9-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] - ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>) - ; 
GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; ; GFX9-DENORM-LABEL: name: test_4xfloat_add_mul ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11 ; GFX9-DENORM-NEXT: {{ $}} @@ -898,32 +981,6 @@ body: | ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; - ; GFX9-UNSAFE-LABEL: name: test_4xfloat_add_mul - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX9-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; GFX9-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GFX9-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] - ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; ; GFX10-LABEL: name: test_4xfloat_add_mul ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11 ; GFX10-NEXT: {{ $}} @@ -951,32 +1008,6 @@ body: | ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 ; - ; GFX10-CONTRACT-LABEL: name: test_4xfloat_add_mul - ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11 - ; GFX10-CONTRACT-NEXT: {{ $}} - ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-CONTRACT-NEXT: 
[[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX10-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; GFX10-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GFX10-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] - ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>) - ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; ; GFX10-DENORM-LABEL: name: test_4xfloat_add_mul ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11 ; GFX10-DENORM-NEXT: {{ $}} @@ -1003,32 +1034,6 @@ body: | ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32) ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 - ; - ; GFX10-UNSAFE-LABEL: name: test_4xfloat_add_mul - ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11 - ; GFX10-UNSAFE-NEXT: {{ $}} - ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) - ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX10-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; GFX10-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GFX10-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) - ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] - ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), 
[[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>) - ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 %4:_(s32) = COPY $vgpr0 %5:_(s32) = COPY $vgpr1 %6:_(s32) = COPY $vgpr2 @@ -1055,6 +1060,140 @@ body: | ... --- +name: test_4xfloat_add_mul_contract +body: | + bb.1.entry: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + + ; GFX9-LABEL: name: test_4xfloat_add_mul_contract + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>) + ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; + ; GFX9-DENORM-LABEL: name: test_4xfloat_add_mul_contract + ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GFX9-DENORM-NEXT: {{ $}} + ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX9-DENORM-NEXT: 
[[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GFX9-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX9-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] + ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>) + ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; + ; GFX10-LABEL: name: test_4xfloat_add_mul_contract + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] + ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>) + ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + ; + ; GFX10-DENORM-LABEL: name: test_4xfloat_add_mul_contract + ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; GFX10-DENORM-NEXT: {{ $}} + ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) + ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY 
$vgpr7 + ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX10-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GFX10-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX10-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) + ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<4 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] + ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s32>) + ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 + %4:_(s32) = COPY $vgpr0 + %5:_(s32) = COPY $vgpr1 + %6:_(s32) = COPY $vgpr2 + %7:_(s32) = COPY $vgpr3 + %0:_(<4 x s32>) = G_BUILD_VECTOR %4(s32), %5(s32), %6(s32), %7(s32) + %8:_(s32) = COPY $vgpr4 + %9:_(s32) = COPY $vgpr5 + %10:_(s32) = COPY $vgpr6 + %11:_(s32) = COPY $vgpr7 + %1:_(<4 x s32>) = G_BUILD_VECTOR %8(s32), %9(s32), %10(s32), %11(s32) + %12:_(s32) = COPY $vgpr8 + %13:_(s32) = COPY $vgpr9 + %14:_(s32) = COPY $vgpr10 + %15:_(s32) = COPY $vgpr11 + %2:_(<4 x s32>) = G_BUILD_VECTOR %12(s32), %13(s32), %14(s32), %15(s32) + %16:_(<4 x s32>) = reassoc contract G_FMUL %0, %1 + %17:_(<4 x s32>) = reassoc contract G_FADD %16, %2 + %19:_(s32), %20:_(s32), %21:_(s32), %22:_(s32) = G_UNMERGE_VALUES %17(<4 x s32>) + $vgpr0 = COPY %19(s32) + $vgpr1 = COPY %20(s32) + $vgpr2 = COPY %21(s32) + $vgpr3 = COPY %22(s32) + S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 +... 
+ +--- name: test_3xfloat_add_mul_rhs body: | bb.1.entry: @@ -1083,28 +1222,6 @@ body: | ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 ; - ; GFX9-CONTRACT-LABEL: name: test_3xfloat_add_mul_rhs - ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 - ; GFX9-CONTRACT-NEXT: {{ $}} - ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) - ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32) - ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32) - ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] - ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>) - ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 - ; ; GFX9-DENORM-LABEL: name: test_3xfloat_add_mul_rhs ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX9-DENORM-NEXT: {{ $}} @@ -1128,28 +1245,6 @@ body: | ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 ; - ; GFX9-UNSAFE-LABEL: name: test_3xfloat_add_mul_rhs - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) - ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32) - ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32) - ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] - ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = 
G_UNMERGE_VALUES [[FMA]](<3 x s32>) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 - ; ; GFX10-LABEL: name: test_3xfloat_add_mul_rhs ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX10-NEXT: {{ $}} @@ -1173,28 +1268,6 @@ body: | ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 ; - ; GFX10-CONTRACT-LABEL: name: test_3xfloat_add_mul_rhs - ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 - ; GFX10-CONTRACT-NEXT: {{ $}} - ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) - ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32) - ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32) - ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] - ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>) - ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 - ; ; GFX10-DENORM-LABEL: name: test_3xfloat_add_mul_rhs ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 ; GFX10-DENORM-NEXT: {{ $}} @@ -1217,28 +1290,124 @@ body: | ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32) ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 + %4:_(s32) = COPY $vgpr0 + %5:_(s32) = COPY $vgpr1 + %6:_(s32) = COPY $vgpr2 + %0:_(<3 x s32>) = G_BUILD_VECTOR %4(s32), %5(s32), %6(s32) + %7:_(s32) = COPY $vgpr3 + %8:_(s32) = COPY $vgpr4 + %9:_(s32) = COPY $vgpr5 + %1:_(<3 x s32>) = G_BUILD_VECTOR %7(s32), %8(s32), %9(s32) + %10:_(s32) = COPY $vgpr6 + %11:_(s32) = COPY $vgpr7 + %12:_(s32) = COPY $vgpr8 + %2:_(<3 x s32>) = G_BUILD_VECTOR %10(s32), %11(s32), %12(s32) + %13:_(<3 x s32>) = reassoc G_FMUL %0, %1 + %14:_(<3 x s32>) = reassoc G_FADD %2, %13 + %16:_(s32), %17:_(s32), %18:_(s32) = G_UNMERGE_VALUES %14(<3 x s32>) + $vgpr0 = COPY %16(s32) + $vgpr1 = COPY %17(s32) + $vgpr2 = COPY %18(s32) + S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 +... 
+ +--- +name: test_3xfloat_add_mul_rhs_contract +body: | + bb.1.entry: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + + ; GFX9-LABEL: name: test_3xfloat_add_mul_rhs_contract + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32) + ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32) + ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]] + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>) + ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 + ; + ; GFX9-DENORM-LABEL: name: test_3xfloat_add_mul_rhs_contract + ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX9-DENORM-NEXT: {{ $}} + ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32) + ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32) + ; GFX9-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; GFX9-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]] + ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>) + ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 + ; + ; GFX10-LABEL: name: test_3xfloat_add_mul_rhs_contract + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; 
GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32) + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32) + ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]] + ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>) + ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 ; - ; GFX10-UNSAFE-LABEL: name: test_3xfloat_add_mul_rhs - ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 - ; GFX10-UNSAFE-NEXT: {{ $}} - ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) - ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32) - ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32) - ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s32>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] - ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s32>) - ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 + ; GFX10-DENORM-LABEL: name: test_3xfloat_add_mul_rhs_contract + ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX10-DENORM-NEXT: {{ $}} + ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32) + ; 
GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32) + ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32) + ; GFX10-DENORM-NEXT: [[FMUL:%[0-9]+]]:_(<3 x s32>) = reassoc G_FMUL [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; GFX10-DENORM-NEXT: [[FADD:%[0-9]+]]:_(<3 x s32>) = reassoc G_FADD [[BUILD_VECTOR2]], [[FMUL]] + ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FADD]](<3 x s32>) + ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 %4:_(s32) = COPY $vgpr0 %5:_(s32) = COPY $vgpr1 %6:_(s32) = COPY $vgpr2 @@ -1285,24 +1454,6 @@ body: | ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 ; - ; GFX9-CONTRACT-LABEL: name: test_4xhalf_add_mul - ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX9-CONTRACT-NEXT: {{ $}} - ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>) - ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 - ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 - ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>) - ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 - ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 - ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>) - ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]] - ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>) - ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) - ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) - ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; ; GFX9-DENORM-LABEL: name: test_4xhalf_add_mul ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GFX9-DENORM-NEXT: {{ $}} @@ -1322,24 +1473,6 @@ body: | ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 ; - ; GFX9-UNSAFE-LABEL: name: test_4xhalf_add_mul - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), 
[[COPY1]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 - ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 - ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 - ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 - ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]] - ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; ; GFX10-LABEL: name: test_4xhalf_add_mul ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GFX10-NEXT: {{ $}} @@ -1359,24 +1492,6 @@ body: | ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 ; - ; GFX10-CONTRACT-LABEL: name: test_4xhalf_add_mul - ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX10-CONTRACT-NEXT: {{ $}} - ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>) - ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 - ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 - ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>) - ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 - ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 - ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>) - ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]] - ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>) - ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) - ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) - ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; ; GFX10-DENORM-LABEL: name: test_4xhalf_add_mul ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GFX10-DENORM-NEXT: {{ $}} @@ -1395,24 +1510,6 @@ body: | ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; - ; GFX10-UNSAFE-LABEL: name: test_4xhalf_add_mul - ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX10-UNSAFE-NEXT: {{ $}} - ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>) - ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 - ; 
GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 - ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>) - ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 - ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 - ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>) - ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]] - ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>) - ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) - ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) - ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 %4:_(<2 x s16>) = COPY $vgpr0 %5:_(<2 x s16>) = COPY $vgpr1 %0:_(<4 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>) @@ -1431,6 +1528,100 @@ body: | ... --- +name: test_4xhalf_add_mul_contract +body: | + bb.1.entry: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + + ; GFX9-LABEL: name: test_4xhalf_add_mul_contract + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>) + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX9-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>) + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 + ; GFX9-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>) + ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]] + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>) + ; GFX9-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) + ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) + ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; + ; GFX9-DENORM-LABEL: name: test_4xhalf_add_mul_contract + ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX9-DENORM-NEXT: {{ $}} + ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>) + ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>) + ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 + ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 + ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>) + ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]] + ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), 
[[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>) + ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) + ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) + ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; + ; GFX10-LABEL: name: test_4xhalf_add_mul_contract + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>) + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX10-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>) + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 + ; GFX10-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>) + ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]] + ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>) + ; GFX10-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) + ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) + ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; + ; GFX10-DENORM-LABEL: name: test_4xhalf_add_mul_contract + ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX10-DENORM-NEXT: {{ $}} + ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>) + ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>) + ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 + ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 + ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>) + ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<4 x s16>) = G_FMA [[CONCAT_VECTORS]], [[CONCAT_VECTORS1]], [[CONCAT_VECTORS2]] + ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[FMA]](<4 x s16>) + ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>) + ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>) + ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + %4:_(<2 x s16>) = COPY $vgpr0 + %5:_(<2 x s16>) = COPY $vgpr1 + %0:_(<4 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>) + %6:_(<2 x s16>) = COPY $vgpr2 + %7:_(<2 x s16>) = COPY $vgpr3 + %1:_(<4 x s16>) = G_CONCAT_VECTORS %6(<2 x s16>), %7(<2 x s16>) + %8:_(<2 x s16>) = COPY $vgpr4 + %9:_(<2 x s16>) = COPY $vgpr5 + %2:_(<4 x s16>) = G_CONCAT_VECTORS %8(<2 x s16>), %9(<2 x s16>) + %10:_(<4 x s16>) = reassoc contract G_FMUL %0, %1 + %11:_(<4 x s16>) = reassoc contract G_FADD %10, %2 + %13:_(<2 x s16>), %14:_(<2 x s16>) = G_UNMERGE_VALUES %11(<4 x s16>) + $vgpr0 = COPY %13(<2 x s16>) + $vgpr1 = COPY %14(<2 x 
s16>) + S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 +... + +--- name: test_3xhalf_add_mul_rhs body: | bb.1.entry: @@ -1461,30 +1652,6 @@ body: | ; GFX9-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>) ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 ; - ; GFX9-CONTRACT-LABEL: name: test_3xhalf_add_mul_rhs - ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX9-CONTRACT-NEXT: {{ $}} - ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX9-CONTRACT-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) - ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 - ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 - ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX9-CONTRACT-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>) - ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 - ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 - ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX9-CONTRACT-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>) - ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]] - ; GFX9-CONTRACT-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF - ; GFX9-CONTRACT-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>) - ; GFX9-CONTRACT-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>) - ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>) - ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>) - ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; ; GFX9-DENORM-LABEL: name: test_3xhalf_add_mul_rhs ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GFX9-DENORM-NEXT: {{ $}} @@ -1510,30 +1677,6 @@ body: | ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>) ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 ; - ; GFX9-UNSAFE-LABEL: name: test_3xhalf_add_mul_rhs - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) - ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 - ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 - ; GFX9-UNSAFE-NEXT: 
[[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>) - ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 - ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 - ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>) - ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]] - ; GFX9-UNSAFE-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF - ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>) - ; GFX9-UNSAFE-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; ; GFX10-LABEL: name: test_3xhalf_add_mul_rhs ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GFX10-NEXT: {{ $}} @@ -1559,30 +1702,6 @@ body: | ; GFX10-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>) ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 ; - ; GFX10-CONTRACT-LABEL: name: test_3xhalf_add_mul_rhs - ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX10-CONTRACT-NEXT: {{ $}} - ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX10-CONTRACT-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) - ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 - ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 - ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX10-CONTRACT-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>) - ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 - ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 - ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX10-CONTRACT-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>) - ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]] - ; GFX10-CONTRACT-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF - ; GFX10-CONTRACT-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>) - ; GFX10-CONTRACT-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>) - ; 
GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>) - ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>) - ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; ; GFX10-DENORM-LABEL: name: test_3xhalf_add_mul_rhs ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GFX10-DENORM-NEXT: {{ $}} @@ -1607,30 +1726,6 @@ body: | ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>) ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>) ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 - ; - ; GFX10-UNSAFE-LABEL: name: test_3xhalf_add_mul_rhs - ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GFX10-UNSAFE-NEXT: {{ $}} - ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX10-UNSAFE-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF - ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) - ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 - ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 - ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX10-UNSAFE-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>) - ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 - ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 - ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>) - ; GFX10-UNSAFE-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>) - ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]] - ; GFX10-UNSAFE-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF - ; GFX10-UNSAFE-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>) - ; GFX10-UNSAFE-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>) - ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>) - ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>) - ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 %4:_(<2 x s16>) = COPY $vgpr0 %5:_(<2 x s16>) = COPY $vgpr1 %10:_(<2 x s16>) = G_IMPLICIT_DEF @@ -1655,6 +1750,130 @@ body: | ... 
--- +name: test_3xhalf_add_mul_rhs_contract +body: | + bb.1.entry: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + + ; GFX9-LABEL: name: test_3xhalf_add_mul_rhs_contract + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX9-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>) + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 + ; GFX9-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>) + ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]] + ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF + ; GFX9-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>) + ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>) + ; GFX9-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>) + ; GFX9-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>) + ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; + ; GFX9-DENORM-LABEL: name: test_3xhalf_add_mul_rhs_contract + ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX9-DENORM-NEXT: {{ $}} + ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX9-DENORM-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX9-DENORM-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>) + ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 + ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 + ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX9-DENORM-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>) + ; GFX9-DENORM-NEXT: 
[[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]] + ; GFX9-DENORM-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF + ; GFX9-DENORM-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>) + ; GFX9-DENORM-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>) + ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>) + ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>) + ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; + ; GFX10-LABEL: name: test_3xhalf_add_mul_rhs_contract + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX10-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(<3 x s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>) + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 + ; GFX10-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX10-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>) + ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]] + ; GFX10-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF + ; GFX10-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>) + ; GFX10-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>) + ; GFX10-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>) + ; GFX10-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>) + ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + ; + ; GFX10-DENORM-LABEL: name: test_3xhalf_add_mul_rhs_contract + ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GFX10-DENORM-NEXT: {{ $}} + ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX10-DENORM-NEXT: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX10-DENORM-NEXT: [[UV2:%[0-9]+]]:_(<3 x 
s16>), [[UV3:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>) + ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr4 + ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr5 + ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS2:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY4]](<2 x s16>), [[COPY5]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX10-DENORM-NEXT: [[UV4:%[0-9]+]]:_(<3 x s16>), [[UV5:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS2]](<6 x s16>) + ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<3 x s16>) = G_FMA [[UV]], [[UV2]], [[UV4]] + ; GFX10-DENORM-NEXT: [[DEF1:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF + ; GFX10-DENORM-NEXT: [[CONCAT_VECTORS3:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[FMA]](<3 x s16>), [[DEF1]](<3 x s16>) + ; GFX10-DENORM-NEXT: [[UV6:%[0-9]+]]:_(<2 x s16>), [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS3]](<6 x s16>) + ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV6]](<2 x s16>) + ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV7]](<2 x s16>) + ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 + %4:_(<2 x s16>) = COPY $vgpr0 + %5:_(<2 x s16>) = COPY $vgpr1 + %10:_(<2 x s16>) = G_IMPLICIT_DEF + %11:_(<6 x s16>) = G_CONCAT_VECTORS %4(<2 x s16>), %5(<2 x s16>), %10(<2 x s16>) + %0:_(<3 x s16>), %12:_(<3 x s16>) = G_UNMERGE_VALUES %11(<6 x s16>) + %6:_(<2 x s16>) = COPY $vgpr2 + %7:_(<2 x s16>) = COPY $vgpr3 + %13:_(<6 x s16>) = G_CONCAT_VECTORS %6(<2 x s16>), %7(<2 x s16>), %10(<2 x s16>) + %1:_(<3 x s16>), %14:_(<3 x s16>) = G_UNMERGE_VALUES %13(<6 x s16>) + %8:_(<2 x s16>) = COPY $vgpr4 + %9:_(<2 x s16>) = COPY $vgpr5 + %15:_(<6 x s16>) = G_CONCAT_VECTORS %8(<2 x s16>), %9(<2 x s16>), %10(<2 x s16>) + %2:_(<3 x s16>), %16:_(<3 x s16>) = G_UNMERGE_VALUES %15(<6 x s16>) + %17:_(<3 x s16>) = reassoc contract G_FMUL %0, %1 + %18:_(<3 x s16>) = reassoc contract G_FADD %2, %17 + %22:_(<3 x s16>) = G_IMPLICIT_DEF + %23:_(<6 x s16>) = G_CONCAT_VECTORS %18(<3 x s16>), %22(<3 x s16>) + %20:_(<2 x s16>), %21:_(<2 x s16>), %24:_(<2 x s16>) = G_UNMERGE_VALUES %23(<6 x s16>) + $vgpr0 = COPY %20(<2 x s16>) + $vgpr1 = COPY %21(<2 x s16>) + S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1 +... 
+ +--- name: test_4xdouble_add_mul body: | bb.1.entry: @@ -1715,60 +1934,6 @@ body: | ; GFX9-NEXT: $vgpr7 = COPY [[UV7]](s32) ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 ; - ; GFX9-CONTRACT-LABEL: name: test_4xdouble_add_mul - ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 - ; GFX9-CONTRACT-NEXT: {{ $}} - ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX9-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) - ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64) - ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX9-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; GFX9-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GFX9-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GFX9-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 - ; GFX9-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 - ; GFX9-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 - ; GFX9-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 - ; GFX9-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) - ; GFX9-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) - ; GFX9-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) - ; GFX9-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) - ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64) - ; GFX9-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 - ; GFX9-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 - ; GFX9-CONTRACT-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18 - ; GFX9-CONTRACT-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19 - ; GFX9-CONTRACT-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20 - ; GFX9-CONTRACT-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21 - ; GFX9-CONTRACT-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22 - ; GFX9-CONTRACT-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23 - ; GFX9-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) - ; GFX9-CONTRACT-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32) - ; GFX9-CONTRACT-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32) - ; GFX9-CONTRACT-NEXT: [[MV11:%[0-9]+]]:_(s64) = 
G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32) - ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64) - ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] - ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>) - ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr6 = COPY [[UV6]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr7 = COPY [[UV7]](s32) - ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 - ; ; GFX9-DENORM-LABEL: name: test_4xdouble_add_mul ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 ; GFX9-DENORM-NEXT: {{ $}} @@ -1824,60 +1989,6 @@ body: | ; GFX9-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32) ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 ; - ; GFX9-UNSAFE-LABEL: name: test_4xdouble_add_mul - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX9-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64) - ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX9-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; GFX9-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GFX9-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GFX9-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 - ; GFX9-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 - ; GFX9-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 - ; GFX9-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 - ; GFX9-UNSAFE-NEXT: 
[[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) - ; GFX9-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) - ; GFX9-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) - ; GFX9-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64) - ; GFX9-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 - ; GFX9-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 - ; GFX9-UNSAFE-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18 - ; GFX9-UNSAFE-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19 - ; GFX9-UNSAFE-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20 - ; GFX9-UNSAFE-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21 - ; GFX9-UNSAFE-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22 - ; GFX9-UNSAFE-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23 - ; GFX9-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) - ; GFX9-UNSAFE-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32) - ; GFX9-UNSAFE-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32) - ; GFX9-UNSAFE-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32) - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64) - ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] - ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr6 = COPY [[UV6]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr7 = COPY [[UV7]](s32) - ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 - ; ; GFX10-LABEL: name: test_4xdouble_add_mul ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 ; GFX10-NEXT: {{ $}} @@ -1933,60 +2044,6 @@ body: | ; GFX10-NEXT: $vgpr7 = COPY [[UV7]](s32) ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 ; - ; GFX10-CONTRACT-LABEL: name: test_4xdouble_add_mul - ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 - ; GFX10-CONTRACT-NEXT: {{ $}} - ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-CONTRACT-NEXT: 
[[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX10-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) - ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64) - ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX10-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; GFX10-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GFX10-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GFX10-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 - ; GFX10-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 - ; GFX10-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 - ; GFX10-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 - ; GFX10-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) - ; GFX10-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) - ; GFX10-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) - ; GFX10-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) - ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64) - ; GFX10-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 - ; GFX10-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 - ; GFX10-CONTRACT-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18 - ; GFX10-CONTRACT-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19 - ; GFX10-CONTRACT-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20 - ; GFX10-CONTRACT-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21 - ; GFX10-CONTRACT-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22 - ; GFX10-CONTRACT-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23 - ; GFX10-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) - ; GFX10-CONTRACT-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32) - ; GFX10-CONTRACT-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32) - ; GFX10-CONTRACT-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32) - ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64) - ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] - ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>) - ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32) - ; 
GFX10-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr6 = COPY [[UV6]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr7 = COPY [[UV7]](s32) - ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 - ; ; GFX10-DENORM-LABEL: name: test_4xdouble_add_mul ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 ; GFX10-DENORM-NEXT: {{ $}} @@ -2041,60 +2098,6 @@ body: | ; GFX10-DENORM-NEXT: $vgpr6 = COPY [[UV6]](s32) ; GFX10-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32) ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 - ; - ; GFX10-UNSAFE-LABEL: name: test_4xdouble_add_mul - ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 - ; GFX10-UNSAFE-NEXT: {{ $}} - ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX10-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) - ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64) - ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX10-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; GFX10-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GFX10-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GFX10-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 - ; GFX10-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 - ; GFX10-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 - ; GFX10-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 - ; GFX10-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) - ; GFX10-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) - ; GFX10-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) - ; GFX10-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) - ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64) - ; GFX10-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 - ; GFX10-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 - ; GFX10-UNSAFE-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18 - ; GFX10-UNSAFE-NEXT: 
[[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19 - ; GFX10-UNSAFE-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20 - ; GFX10-UNSAFE-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21 - ; GFX10-UNSAFE-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22 - ; GFX10-UNSAFE-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23 - ; GFX10-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) - ; GFX10-UNSAFE-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32) - ; GFX10-UNSAFE-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32) - ; GFX10-UNSAFE-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32) - ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64) - ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] - ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>) - ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr6 = COPY [[UV6]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr7 = COPY [[UV7]](s32) - ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 %4:_(s32) = COPY $vgpr0 %5:_(s32) = COPY $vgpr1 %6:_(s32) = COPY $vgpr2 @@ -2149,6 +2152,280 @@ body: | ... 
--- +name: test_4xdouble_add_mul_contract +body: | + bb.1.entry: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX9-LABEL: name: test_4xdouble_add_mul_contract + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64) + ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 + ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 + ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) + ; GFX9-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) + ; GFX9-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) + ; GFX9-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64) + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 + ; GFX9-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18 + ; GFX9-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19 + ; GFX9-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20 + ; GFX9-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21 + ; GFX9-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22 + ; GFX9-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23 + ; GFX9-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) + ; GFX9-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32) + ; GFX9-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32) + ; GFX9-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64) + ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), 
[[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>) + ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr4 = COPY [[UV4]](s32) + ; GFX9-NEXT: $vgpr5 = COPY [[UV5]](s32) + ; GFX9-NEXT: $vgpr6 = COPY [[UV6]](s32) + ; GFX9-NEXT: $vgpr7 = COPY [[UV7]](s32) + ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; + ; GFX9-DENORM-LABEL: name: test_4xdouble_add_mul_contract + ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX9-DENORM-NEXT: {{ $}} + ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX9-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) + ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64) + ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX9-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GFX9-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX9-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX9-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 + ; GFX9-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 + ; GFX9-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 + ; GFX9-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 + ; GFX9-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) + ; GFX9-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) + ; GFX9-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) + ; GFX9-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) + ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64) + ; GFX9-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 + ; GFX9-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 + ; GFX9-DENORM-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18 + ; GFX9-DENORM-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19 + ; GFX9-DENORM-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20 + ; GFX9-DENORM-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21 + ; GFX9-DENORM-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22 + ; GFX9-DENORM-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23 + ; GFX9-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES 
[[COPY16]](s32), [[COPY17]](s32) + ; GFX9-DENORM-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32) + ; GFX9-DENORM-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32) + ; GFX9-DENORM-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32) + ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64) + ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] + ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>) + ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32) + ; GFX9-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32) + ; GFX9-DENORM-NEXT: $vgpr6 = COPY [[UV6]](s32) + ; GFX9-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32) + ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; + ; GFX10-LABEL: name: test_4xdouble_add_mul_contract + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX10-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64) + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 + ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 + ; GFX10-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) + ; GFX10-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) + ; GFX10-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) + ; GFX10-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64) + ; 
GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18 + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19 + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20 + ; GFX10-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21 + ; GFX10-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22 + ; GFX10-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23 + ; GFX10-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) + ; GFX10-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32) + ; GFX10-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32) + ; GFX10-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64) + ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] + ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>) + ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr4 = COPY [[UV4]](s32) + ; GFX10-NEXT: $vgpr5 = COPY [[UV5]](s32) + ; GFX10-NEXT: $vgpr6 = COPY [[UV6]](s32) + ; GFX10-NEXT: $vgpr7 = COPY [[UV7]](s32) + ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + ; + ; GFX10-DENORM-LABEL: name: test_4xdouble_add_mul_contract + ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX10-DENORM-NEXT: {{ $}} + ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX10-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) + ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64) + ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX10-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GFX10-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX10-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX10-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 + ; GFX10-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 + ; GFX10-DENORM-NEXT: 
[[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 + ; GFX10-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 + ; GFX10-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) + ; GFX10-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) + ; GFX10-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) + ; GFX10-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) + ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV4]](s64), [[MV5]](s64), [[MV6]](s64), [[MV7]](s64) + ; GFX10-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 + ; GFX10-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 + ; GFX10-DENORM-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18 + ; GFX10-DENORM-NEXT: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19 + ; GFX10-DENORM-NEXT: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20 + ; GFX10-DENORM-NEXT: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21 + ; GFX10-DENORM-NEXT: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22 + ; GFX10-DENORM-NEXT: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23 + ; GFX10-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) + ; GFX10-DENORM-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY18]](s32), [[COPY19]](s32) + ; GFX10-DENORM-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY20]](s32), [[COPY21]](s32) + ; GFX10-DENORM-NEXT: [[MV11:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY22]](s32), [[COPY23]](s32) + ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV8]](s64), [[MV9]](s64), [[MV10]](s64), [[MV11]](s64) + ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<4 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] + ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<4 x s64>) + ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32) + ; GFX10-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32) + ; GFX10-DENORM-NEXT: $vgpr6 = COPY [[UV6]](s32) + ; GFX10-DENORM-NEXT: $vgpr7 = COPY [[UV7]](s32) + ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 + %4:_(s32) = COPY $vgpr0 + %5:_(s32) = COPY $vgpr1 + %6:_(s32) = COPY $vgpr2 + %7:_(s32) = COPY $vgpr3 + %8:_(s32) = COPY $vgpr4 + %9:_(s32) = COPY $vgpr5 + %10:_(s32) = COPY $vgpr6 + %11:_(s32) = COPY $vgpr7 + %28:_(s64) = G_MERGE_VALUES %4(s32), %5(s32) + %29:_(s64) = G_MERGE_VALUES %6(s32), %7(s32) + %30:_(s64) = G_MERGE_VALUES %8(s32), %9(s32) + %31:_(s64) = G_MERGE_VALUES %10(s32), %11(s32) + %0:_(<4 x s64>) = G_BUILD_VECTOR %28(s64), %29(s64), %30(s64), %31(s64) + %12:_(s32) = COPY $vgpr8 + %13:_(s32) = COPY $vgpr9 + %14:_(s32) = COPY $vgpr10 + %15:_(s32) = COPY $vgpr11 + %16:_(s32) = COPY $vgpr12 + %17:_(s32) = COPY $vgpr13 + %18:_(s32) = COPY $vgpr14 + %19:_(s32) = COPY $vgpr15 + %32:_(s64) = G_MERGE_VALUES %12(s32), %13(s32) + %33:_(s64) = G_MERGE_VALUES %14(s32), %15(s32) + %34:_(s64) = G_MERGE_VALUES %16(s32), %17(s32) + %35:_(s64) = G_MERGE_VALUES %18(s32), %19(s32) + %1:_(<4 x s64>) = G_BUILD_VECTOR %32(s64), 
%33(s64), %34(s64), %35(s64) + %20:_(s32) = COPY $vgpr16 + %21:_(s32) = COPY $vgpr17 + %22:_(s32) = COPY $vgpr18 + %23:_(s32) = COPY $vgpr19 + %24:_(s32) = COPY $vgpr20 + %25:_(s32) = COPY $vgpr21 + %26:_(s32) = COPY $vgpr22 + %27:_(s32) = COPY $vgpr23 + %36:_(s64) = G_MERGE_VALUES %20(s32), %21(s32) + %37:_(s64) = G_MERGE_VALUES %22(s32), %23(s32) + %38:_(s64) = G_MERGE_VALUES %24(s32), %25(s32) + %39:_(s64) = G_MERGE_VALUES %26(s32), %27(s32) + %2:_(<4 x s64>) = G_BUILD_VECTOR %36(s64), %37(s64), %38(s64), %39(s64) + %40:_(<4 x s64>) = reassoc contract G_FMUL %0, %1 + %41:_(<4 x s64>) = reassoc contract G_FADD %40, %2 + %43:_(s32), %44:_(s32), %45:_(s32), %46:_(s32), %47:_(s32), %48:_(s32), %49:_(s32), %50:_(s32) = G_UNMERGE_VALUES %41(<4 x s64>) + $vgpr0 = COPY %43(s32) + $vgpr1 = COPY %44(s32) + $vgpr2 = COPY %45(s32) + $vgpr3 = COPY %46(s32) + $vgpr4 = COPY %47(s32) + $vgpr5 = COPY %48(s32) + $vgpr6 = COPY %49(s32) + $vgpr7 = COPY %50(s32) + S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 +... + +--- name: test_3xdouble_add_mul_rhs body: | bb.1.entry: @@ -2198,49 +2475,6 @@ body: | ; GFX9-NEXT: $vgpr5 = COPY [[UV5]](s32) ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 ; - ; GFX9-CONTRACT-LABEL: name: test_3xdouble_add_mul_rhs - ; GFX9-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17 - ; GFX9-CONTRACT-NEXT: {{ $}} - ; GFX9-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX9-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) - ; GFX9-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX9-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX9-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX9-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; GFX9-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GFX9-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GFX9-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) - ; GFX9-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) - ; GFX9-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) - ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64) - ; GFX9-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 - ; GFX9-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 - ; GFX9-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 - ; GFX9-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 - ; GFX9-CONTRACT-NEXT: 
[[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 - ; GFX9-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 - ; GFX9-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) - ; GFX9-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) - ; GFX9-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) - ; GFX9-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64) - ; GFX9-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] - ; GFX9-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>) - ; GFX9-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32) - ; GFX9-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32) - ; GFX9-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 - ; ; GFX9-DENORM-LABEL: name: test_3xdouble_add_mul_rhs ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17 ; GFX9-DENORM-NEXT: {{ $}} @@ -2285,49 +2519,6 @@ body: | ; GFX9-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32) ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 ; - ; GFX9-UNSAFE-LABEL: name: test_3xdouble_add_mul_rhs - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX9-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX9-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX9-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX9-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX9-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX9-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) - ; GFX9-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX9-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX9-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX9-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; GFX9-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GFX9-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GFX9-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) - ; GFX9-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) - ; GFX9-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR 
[[MV3]](s64), [[MV4]](s64), [[MV5]](s64) - ; GFX9-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 - ; GFX9-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 - ; GFX9-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 - ; GFX9-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 - ; GFX9-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 - ; GFX9-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 - ; GFX9-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) - ; GFX9-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) - ; GFX9-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64) - ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] - ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32) - ; GFX9-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 - ; ; GFX10-LABEL: name: test_3xdouble_add_mul_rhs ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17 ; GFX10-NEXT: {{ $}} @@ -2372,49 +2563,6 @@ body: | ; GFX10-NEXT: $vgpr5 = COPY [[UV5]](s32) ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 ; - ; GFX10-CONTRACT-LABEL: name: test_3xdouble_add_mul_rhs - ; GFX10-CONTRACT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17 - ; GFX10-CONTRACT-NEXT: {{ $}} - ; GFX10-CONTRACT-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-CONTRACT-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-CONTRACT-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-CONTRACT-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-CONTRACT-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10-CONTRACT-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10-CONTRACT-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX10-CONTRACT-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX10-CONTRACT-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) - ; GFX10-CONTRACT-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX10-CONTRACT-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX10-CONTRACT-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX10-CONTRACT-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; GFX10-CONTRACT-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GFX10-CONTRACT-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GFX10-CONTRACT-NEXT: [[MV3:%[0-9]+]]:_(s64) = 
G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) - ; GFX10-CONTRACT-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) - ; GFX10-CONTRACT-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) - ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64) - ; GFX10-CONTRACT-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 - ; GFX10-CONTRACT-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 - ; GFX10-CONTRACT-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 - ; GFX10-CONTRACT-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 - ; GFX10-CONTRACT-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 - ; GFX10-CONTRACT-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 - ; GFX10-CONTRACT-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) - ; GFX10-CONTRACT-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) - ; GFX10-CONTRACT-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) - ; GFX10-CONTRACT-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64) - ; GFX10-CONTRACT-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] - ; GFX10-CONTRACT-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>) - ; GFX10-CONTRACT-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr4 = COPY [[UV4]](s32) - ; GFX10-CONTRACT-NEXT: $vgpr5 = COPY [[UV5]](s32) - ; GFX10-CONTRACT-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 - ; ; GFX10-DENORM-LABEL: name: test_3xdouble_add_mul_rhs ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17 ; GFX10-DENORM-NEXT: {{ $}} @@ -2458,49 +2606,6 @@ body: | ; GFX10-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32) ; GFX10-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32) ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 - ; - ; GFX10-UNSAFE-LABEL: name: test_3xdouble_add_mul_rhs - ; GFX10-UNSAFE: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17 - ; GFX10-UNSAFE-NEXT: {{ $}} - ; GFX10-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX10-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX10-UNSAFE-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 - ; GFX10-UNSAFE-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 - ; GFX10-UNSAFE-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 - ; GFX10-UNSAFE-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 - ; GFX10-UNSAFE-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) - ; GFX10-UNSAFE-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) - ; GFX10-UNSAFE-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) - ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) - ; 
GFX10-UNSAFE-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 - ; GFX10-UNSAFE-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 - ; GFX10-UNSAFE-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 - ; GFX10-UNSAFE-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 - ; GFX10-UNSAFE-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 - ; GFX10-UNSAFE-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 - ; GFX10-UNSAFE-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) - ; GFX10-UNSAFE-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) - ; GFX10-UNSAFE-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) - ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64) - ; GFX10-UNSAFE-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 - ; GFX10-UNSAFE-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 - ; GFX10-UNSAFE-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 - ; GFX10-UNSAFE-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 - ; GFX10-UNSAFE-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 - ; GFX10-UNSAFE-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 - ; GFX10-UNSAFE-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) - ; GFX10-UNSAFE-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) - ; GFX10-UNSAFE-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) - ; GFX10-UNSAFE-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64) - ; GFX10-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] - ; GFX10-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>) - ; GFX10-UNSAFE-NEXT: $vgpr0 = COPY [[UV]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr1 = COPY [[UV1]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr2 = COPY [[UV2]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr3 = COPY [[UV3]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr4 = COPY [[UV4]](s32) - ; GFX10-UNSAFE-NEXT: $vgpr5 = COPY [[UV5]](s32) - ; GFX10-UNSAFE-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 %4:_(s32) = COPY $vgpr0 %5:_(s32) = COPY $vgpr1 %6:_(s32) = COPY $vgpr2 @@ -2542,3 +2647,222 @@ body: | $vgpr5 = COPY %39(s32) S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 ... 
+ +--- +name: test_3xdouble_add_mul_rhs_contract +body: | + bb.1.entry: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17 + + ; GFX9-LABEL: name: test_3xdouble_add_mul_rhs_contract + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) + ; GFX9-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX9-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX9-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX9-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GFX9-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX9-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) + ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) + ; GFX9-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64) + ; GFX9-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 + ; GFX9-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 + ; GFX9-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 + ; GFX9-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 + ; GFX9-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 + ; GFX9-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 + ; GFX9-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) + ; GFX9-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) + ; GFX9-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64) + ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] + ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>) + ; GFX9-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX9-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-NEXT: $vgpr4 = COPY [[UV4]](s32) + ; GFX9-NEXT: $vgpr5 = COPY [[UV5]](s32) + ; GFX9-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 + ; + ; GFX9-DENORM-LABEL: name: test_3xdouble_add_mul_rhs_contract + ; GFX9-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17 + ; 
GFX9-DENORM-NEXT: {{ $}} + ; GFX9-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX9-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX9-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX9-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX9-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX9-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX9-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) + ; GFX9-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX9-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX9-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX9-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GFX9-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX9-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX9-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) + ; GFX9-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) + ; GFX9-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) + ; GFX9-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64) + ; GFX9-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 + ; GFX9-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 + ; GFX9-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 + ; GFX9-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 + ; GFX9-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 + ; GFX9-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 + ; GFX9-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) + ; GFX9-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) + ; GFX9-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) + ; GFX9-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64) + ; GFX9-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] + ; GFX9-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>) + ; GFX9-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX9-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX9-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX9-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX9-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32) + ; GFX9-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32) + ; GFX9-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 + ; + ; GFX10-LABEL: name: test_3xdouble_add_mul_rhs_contract + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-NEXT: 
[[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX10-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX10-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) + ; GFX10-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) + ; GFX10-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64) + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 + ; GFX10-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 + ; GFX10-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) + ; GFX10-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) + ; GFX10-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64) + ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] + ; GFX10-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>) + ; GFX10-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX10-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX10-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX10-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-NEXT: $vgpr4 = COPY [[UV4]](s32) + ; GFX10-NEXT: $vgpr5 = COPY [[UV5]](s32) + ; GFX10-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 + ; + ; GFX10-DENORM-LABEL: name: test_3xdouble_add_mul_rhs_contract + ; GFX10-DENORM: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17 + ; GFX10-DENORM-NEXT: {{ $}} + ; GFX10-DENORM-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-DENORM-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-DENORM-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10-DENORM-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GFX10-DENORM-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GFX10-DENORM-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GFX10-DENORM-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; GFX10-DENORM-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; GFX10-DENORM-NEXT: [[MV2:%[0-9]+]]:_(s64) = 
G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; GFX10-DENORM-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64) + ; GFX10-DENORM-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GFX10-DENORM-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GFX10-DENORM-NEXT: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GFX10-DENORM-NEXT: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GFX10-DENORM-NEXT: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GFX10-DENORM-NEXT: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GFX10-DENORM-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32) + ; GFX10-DENORM-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY8]](s32), [[COPY9]](s32) + ; GFX10-DENORM-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) + ; GFX10-DENORM-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV3]](s64), [[MV4]](s64), [[MV5]](s64) + ; GFX10-DENORM-NEXT: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 + ; GFX10-DENORM-NEXT: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 + ; GFX10-DENORM-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 + ; GFX10-DENORM-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 + ; GFX10-DENORM-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 + ; GFX10-DENORM-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 + ; GFX10-DENORM-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY12]](s32), [[COPY13]](s32) + ; GFX10-DENORM-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY14]](s32), [[COPY15]](s32) + ; GFX10-DENORM-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY16]](s32), [[COPY17]](s32) + ; GFX10-DENORM-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[MV6]](s64), [[MV7]](s64), [[MV8]](s64) + ; GFX10-DENORM-NEXT: [[FMA:%[0-9]+]]:_(<3 x s64>) = G_FMA [[BUILD_VECTOR]], [[BUILD_VECTOR1]], [[BUILD_VECTOR2]] + ; GFX10-DENORM-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMA]](<3 x s64>) + ; GFX10-DENORM-NEXT: $vgpr0 = COPY [[UV]](s32) + ; GFX10-DENORM-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; GFX10-DENORM-NEXT: $vgpr2 = COPY [[UV2]](s32) + ; GFX10-DENORM-NEXT: $vgpr3 = COPY [[UV3]](s32) + ; GFX10-DENORM-NEXT: $vgpr4 = COPY [[UV4]](s32) + ; GFX10-DENORM-NEXT: $vgpr5 = COPY [[UV5]](s32) + ; GFX10-DENORM-NEXT: S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 + %4:_(s32) = COPY $vgpr0 + %5:_(s32) = COPY $vgpr1 + %6:_(s32) = COPY $vgpr2 + %7:_(s32) = COPY $vgpr3 + %8:_(s32) = COPY $vgpr4 + %9:_(s32) = COPY $vgpr5 + %22:_(s64) = G_MERGE_VALUES %4(s32), %5(s32) + %23:_(s64) = G_MERGE_VALUES %6(s32), %7(s32) + %24:_(s64) = G_MERGE_VALUES %8(s32), %9(s32) + %0:_(<3 x s64>) = G_BUILD_VECTOR %22(s64), %23(s64), %24(s64) + %10:_(s32) = COPY $vgpr6 + %11:_(s32) = COPY $vgpr7 + %12:_(s32) = COPY $vgpr8 + %13:_(s32) = COPY $vgpr9 + %14:_(s32) = COPY $vgpr10 + %15:_(s32) = COPY $vgpr11 + %25:_(s64) = G_MERGE_VALUES %10(s32), %11(s32) + %26:_(s64) = G_MERGE_VALUES %12(s32), %13(s32) + %27:_(s64) = G_MERGE_VALUES %14(s32), %15(s32) + %1:_(<3 x s64>) = G_BUILD_VECTOR %25(s64), %26(s64), %27(s64) + %16:_(s32) = COPY $vgpr12 + %17:_(s32) = COPY $vgpr13 + %18:_(s32) = COPY $vgpr14 + %19:_(s32) = COPY $vgpr15 + %20:_(s32) = COPY $vgpr16 + %21:_(s32) = COPY $vgpr17 + %28:_(s64) = G_MERGE_VALUES %16(s32), %17(s32) + %29:_(s64) = G_MERGE_VALUES %18(s32), %19(s32) + %30:_(s64) = G_MERGE_VALUES %20(s32), 
%21(s32) + %2:_(<3 x s64>) = G_BUILD_VECTOR %28(s64), %29(s64), %30(s64) + %31:_(<3 x s64>) = reassoc contract G_FMUL %0, %1 + %32:_(<3 x s64>) = reassoc contract G_FADD %2, %31 + %34:_(s32), %35:_(s32), %36:_(s32), %37:_(s32), %38:_(s32), %39:_(s32) = G_UNMERGE_VALUES %32(<3 x s64>) + $vgpr0 = COPY %34(s32) + $vgpr1 = COPY %35(s32) + $vgpr2 = COPY %36(s32) + $vgpr3 = COPY %37(s32) + $vgpr4 = COPY %38(s32) + $vgpr5 = COPY %39(s32) + S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll index 24dd535..3f6e3d8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll @@ -2,11 +2,9 @@ ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -fp-contract=fast < %s | FileCheck -check-prefix=GFX9-CONTRACT %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GFX9-UNSAFE %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GFX10-UNSAFE %s define float @test_f32_add_mul(float %x, float %y, float %z) { ; GFX9-LABEL: test_f32_add_mul: @@ -28,12 +26,6 @@ define float @test_f32_add_mul(float %x, float %y, float %z) { ; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2 ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-UNSAFE-LABEL: test_f32_add_mul: -; GFX9-UNSAFE: ; %bb.0: ; %.entry -; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2 -; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: test_f32_add_mul: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -52,7 +44,6 @@ define float @test_f32_add_mul(float %x, float %y, float %z) { ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-UNSAFE-LABEL: test_f32_add_mul: ; GFX10-UNSAFE: ; %bb.0: ; %.entry ; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -64,6 +55,58 @@ define float @test_f32_add_mul(float %x, float %y, float %z) { ret float %b } +define float @test_f32_add_mul_contract(float %x, float %y, float %z) { +; GFX9-LABEL: test_f32_add_mul_contract: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_f32_add_mul_contract: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_f32_add_mul_contract: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2 +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_f32_add_mul_contract: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_f32_add_mul_contract: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_f32_add_mul_contract: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX9-UNSAFE-LABEL: test_f32_add_mul_contract: +; GFX9-UNSAFE: ; %bb.0: ; %.entry +; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] +; GFX10-UNSAFE-LABEL: test_f32_add_mul_contract: +; GFX10-UNSAFE: ; %bb.0: ; %.entry +; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31] +.entry: + %a = fmul contract float %x, %y + %b = fadd contract float %a, %z + ret float %b +} + define float @test_f32_add_mul_rhs(float %x, float %y, float %z) { ; GFX9-LABEL: test_f32_add_mul_rhs: ; GFX9: ; %bb.0: ; %.entry @@ -84,12 +127,6 @@ define float @test_f32_add_mul_rhs(float %x, float %y, float %z) { ; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2 ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-UNSAFE-LABEL: test_f32_add_mul_rhs: -; GFX9-UNSAFE: ; %bb.0: ; %.entry -; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2 -; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: test_f32_add_mul_rhs: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -108,7 +145,6 @@ define float @test_f32_add_mul_rhs(float %x, float %y, float %z) { ; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-UNSAFE-LABEL: test_f32_add_mul_rhs: ; GFX10-UNSAFE: ; %bb.0: ; %.entry ; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -120,6 +156,58 @@ define float @test_f32_add_mul_rhs(float %x, float %y, float %z) { ret float %b } +define float @test_f32_add_mul_rhs_contract(float %x, float %y, float %z) { +; GFX9-LABEL: test_f32_add_mul_rhs_contract: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_f32_add_mul_rhs_contract: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_f32_add_mul_rhs_contract: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2 +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_f32_add_mul_rhs_contract: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_f32_add_mul_rhs_contract: +; 
GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_f32_add_mul_rhs_contract: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX9-UNSAFE-LABEL: test_f32_add_mul_rhs_contract: +; GFX9-UNSAFE: ; %bb.0: ; %.entry +; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] +; GFX10-UNSAFE-LABEL: test_f32_add_mul_rhs_contract: +; GFX10-UNSAFE: ; %bb.0: ; %.entry +; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31] +.entry: + %a = fmul contract float %x, %y + %b = fadd contract float %z, %a + ret float %b +} + define float @test_add_mul_multiple_defs_z(float %x, float %y, ptr addrspace(1) %vec_ptr) { ; GFX9-LABEL: test_add_mul_multiple_defs_z: ; GFX9: ; %bb.0: ; %.entry @@ -147,14 +235,6 @@ define float @test_add_mul_multiple_defs_z(float %x, float %y, ptr addrspace(1) ; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-UNSAFE-LABEL: test_add_mul_multiple_defs_z: -; GFX9-UNSAFE: ; %bb.0: ; %.entry -; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-UNSAFE-NEXT: global_load_dword v2, v[2:3], off offset:4 -; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) -; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2 -; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: test_add_mul_multiple_defs_z: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -181,7 +261,6 @@ define float @test_add_mul_multiple_defs_z(float %x, float %y, ptr addrspace(1) ; GFX10-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-UNSAFE-LABEL: test_add_mul_multiple_defs_z: ; GFX10-UNSAFE: ; %bb.0: ; %.entry ; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -198,17 +277,16 @@ define float @test_add_mul_multiple_defs_z(float %x, float %y, ptr addrspace(1) ret float %b } -define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, ptr addrspace(1) %vec_ptr) { -; GFX9-LABEL: test_add_mul_rhs_multiple_defs_z: +define float @test_add_mul_multiple_defs_z_contract(float %x, float %y, ptr addrspace(1) %vec_ptr) { +; GFX9-LABEL: test_add_mul_multiple_defs_z_contract: ; GFX9: ; %bb.0: ; %.entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-CONTRACT-LABEL: test_add_mul_rhs_multiple_defs_z: +; GFX9-CONTRACT-LABEL: test_add_mul_multiple_defs_z_contract: ; GFX9-CONTRACT: ; %bb.0: ; %.entry ; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-CONTRACT-NEXT: global_load_dword v2, v[2:3], off offset:4 @@ -216,7 +294,7 @@ define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, ptr addrspace ; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2 ; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-DENORM-LABEL: test_add_mul_rhs_multiple_defs_z: +; GFX9-DENORM-LABEL: test_add_mul_multiple_defs_z_contract: ; 
GFX9-DENORM: ; %bb.0: ; %.entry ; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-DENORM-NEXT: global_load_dword v2, v[2:3], off offset:4 @@ -225,13 +303,81 @@ define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, ptr addrspace ; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-UNSAFE-LABEL: test_add_mul_rhs_multiple_defs_z: +; GFX10-LABEL: test_add_mul_multiple_defs_z_contract: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v2, v[2:3], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_add_mul_multiple_defs_z_contract: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: global_load_dword v2, v[2:3], off offset:4 +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-CONTRACT-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_add_mul_multiple_defs_z_contract: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v2, v[2:3], off offset:4 +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX9-UNSAFE-LABEL: test_add_mul_multiple_defs_z_contract: ; GFX9-UNSAFE: ; %bb.0: ; %.entry ; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-UNSAFE-NEXT: global_load_dword v2, v[2:3], off offset:4 ; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) ; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v1, v2 ; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] +; GFX10-UNSAFE-LABEL: test_add_mul_multiple_defs_z_contract: +; GFX10-UNSAFE: ; %bb.0: ; %.entry +; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-UNSAFE-NEXT: global_load_dword v2, v[2:3], off offset:4 +; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) +; GFX10-UNSAFE-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX10-UNSAFE-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31] +.entry: + %a = fmul contract float %x, %y + %vec = load <2 x float>, ptr addrspace(1) %vec_ptr + %z = extractelement <2 x float> %vec, i64 1 + %b = fadd contract float %a, %z + ret float %b +} + +define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, ptr addrspace(1) %vec_ptr) { +; GFX9-LABEL: test_add_mul_rhs_multiple_defs_z: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_add_mul_rhs_multiple_defs_z: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: global_load_dword v2, v[2:3], off offset:4 +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_add_mul_rhs_multiple_defs_z: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v2, v[2:3], off offset:4 +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) 
+; GFX9-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1 +; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_add_mul_rhs_multiple_defs_z: ; GFX10: ; %bb.0: ; %.entry @@ -259,7 +405,6 @@ define float @test_add_mul_rhs_multiple_defs_z(float %x, float %y, ptr addrspace ; GFX10-DENORM-NEXT: v_mac_f32_e32 v2, v0, v1 ; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-UNSAFE-LABEL: test_add_mul_rhs_multiple_defs_z: ; GFX10-UNSAFE: ; %bb.0: ; %.entry ; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -296,12 +441,6 @@ define half @test_half_add_mul(half %x, half %y, half %z) { ; GFX9-DENORM-NEXT: v_mad_legacy_f16 v0, v0, v1, v2 ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-UNSAFE-LABEL: test_half_add_mul: -; GFX9-UNSAFE: ; %bb.0: ; %.entry -; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-UNSAFE-NEXT: v_fma_f16 v0, v0, v1, v2 -; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: test_half_add_mul: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -321,7 +460,6 @@ define half @test_half_add_mul(half %x, half %y, half %z) { ; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v0, v2 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-UNSAFE-LABEL: test_half_add_mul: ; GFX10-UNSAFE: ; %bb.0: ; %.entry ; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -333,6 +471,59 @@ define half @test_half_add_mul(half %x, half %y, half %z) { ret half %b } +define half @test_half_add_mul_contract(half %x, half %y, half %z) { +; GFX9-LABEL: test_half_add_mul_contract: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_half_add_mul_contract: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_half_add_mul_contract: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_mad_legacy_f16 v0, v0, v1, v2 +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_half_add_mul_contract: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_half_add_mul_contract: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_half_add_mul_contract: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX9-UNSAFE-LABEL: test_half_add_mul_contract: +; GFX9-UNSAFE: ; %bb.0: ; %.entry +; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-UNSAFE-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] +; GFX10-UNSAFE-LABEL: test_half_add_mul_contract: +; GFX10-UNSAFE: ; %bb.0: ; %.entry +; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-UNSAFE-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31] +.entry: 
+ %a = fmul contract half %x, %y + %b = fadd contract half %a, %z + ret half %b +} + define half @test_half_add_mul_rhs(half %x, half %y, half %z) { ; GFX9-LABEL: test_half_add_mul_rhs: ; GFX9: ; %bb.0: ; %.entry @@ -353,12 +544,6 @@ define half @test_half_add_mul_rhs(half %x, half %y, half %z) { ; GFX9-DENORM-NEXT: v_mad_legacy_f16 v0, v0, v1, v2 ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-UNSAFE-LABEL: test_half_add_mul_rhs: -; GFX9-UNSAFE: ; %bb.0: ; %.entry -; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-UNSAFE-NEXT: v_fma_f16 v0, v0, v1, v2 -; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: test_half_add_mul_rhs: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -378,7 +563,6 @@ define half @test_half_add_mul_rhs(half %x, half %y, half %z) { ; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v2, v0 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-UNSAFE-LABEL: test_half_add_mul_rhs: ; GFX10-UNSAFE: ; %bb.0: ; %.entry ; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -390,6 +574,59 @@ define half @test_half_add_mul_rhs(half %x, half %y, half %z) { ret half %b } +define half @test_half_add_mul_rhs_contract(half %x, half %y, half %z) { +; GFX9-LABEL: test_half_add_mul_rhs_contract: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_half_add_mul_rhs_contract: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_half_add_mul_rhs_contract: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_mad_legacy_f16 v0, v0, v1, v2 +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_half_add_mul_rhs_contract: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_half_add_mul_rhs_contract: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_half_add_mul_rhs_contract: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v2, v0 +; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX9-UNSAFE-LABEL: test_half_add_mul_rhs_contract: +; GFX9-UNSAFE: ; %bb.0: ; %.entry +; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-UNSAFE-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] +; GFX10-UNSAFE-LABEL: test_half_add_mul_rhs_contract: +; GFX10-UNSAFE: ; %bb.0: ; %.entry +; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-UNSAFE-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31] +.entry: + %a = fmul contract half %x, %y + %b = fadd contract half %z, %a + ret half %b +} + define double @test_double_add_mul(double %x, double %y, double %z) { ; GFX9-LABEL: test_double_add_mul: ; GFX9: ; %bb.0: ; %.entry @@ -411,12 +648,6 @@ define double @test_double_add_mul(double %x, double %y, double %z) { ; 
GFX9-DENORM-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-UNSAFE-LABEL: test_double_add_mul: -; GFX9-UNSAFE: ; %bb.0: ; %.entry -; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] -; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: test_double_add_mul: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -436,15 +667,61 @@ define double @test_double_add_mul(double %x, double %y, double %z) { ; GFX10-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX10-DENORM-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +.entry: + %a = fmul double %x, %y + %b = fadd double %a, %z + ret double %b +} + +define double @test_double_add_mul_contract(double %x, double %y, double %z) { +; GFX9-LABEL: test_double_add_mul_contract: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_double_add_mul_contract: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_double_add_mul_contract: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; +; GFX10-LABEL: test_double_add_mul_contract: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_double_add_mul_contract: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_double_add_mul_contract: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX9-UNSAFE-LABEL: test_double_add_mul_contract: +; GFX9-UNSAFE: ; %bb.0: ; %.entry +; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; GFX10-UNSAFE-LABEL: test_double_add_mul: ; GFX10-UNSAFE: ; %bb.0: ; %.entry ; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] ; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31] .entry: - %a = fmul double %x, %y - %b = fadd double %a, %z + %a = fmul contract double %x, %y + %b = fadd contract double %a, %z ret double %b } @@ -469,12 +746,6 @@ define double @test_double_add_mul_rhs(double %x, double %y, double %z) { ; GFX9-DENORM-NEXT: v_add_f64 v[0:1], v[4:5], v[0:1] ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-UNSAFE-LABEL: test_double_add_mul_rhs: -; GFX9-UNSAFE: ; %bb.0: ; %.entry -; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] -; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: test_double_add_mul_rhs: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) @@ -494,15 +765,61 @@ define double @test_double_add_mul_rhs(double %x, double %y, double %z) { ; GFX10-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] ; GFX10-DENORM-NEXT: v_add_f64 v[0:1], v[4:5], v[0:1] ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +.entry: + %a = fmul double %x, %y + %b = fadd double %z, %a + ret double %b +} + +define double @test_double_add_mul_rhs_contract(double %x, double %y, double %z) { +; GFX9-LABEL: test_double_add_mul_rhs_contract: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_double_add_mul_rhs_contract: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_double_add_mul_rhs_contract: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-UNSAFE-LABEL: test_double_add_mul_rhs: +; GFX10-LABEL: test_double_add_mul_rhs_contract: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_double_add_mul_rhs_contract: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_double_add_mul_rhs_contract: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX9-UNSAFE-LABEL: test_double_add_mul_rhs_contract: +; GFX9-UNSAFE: ; %bb.0: ; %.entry +; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] +; GFX10-UNSAFE-LABEL: test_double_add_mul_rhs_contract: ; GFX10-UNSAFE: ; %bb.0: ; %.entry ; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] ; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31] .entry: - %a = fmul double %x, %y - %b = fadd double %z, %a + %a = fmul contract double %x, %y + %b = fadd contract double %z, %a ret double %b } @@ -538,15 +855,6 @@ define <4 x float> @test_4xfloat_add_mul(<4 x float> %x, <4 x float> %y, <4 x fl ; GFX9-DENORM-NEXT: v_mad_f32 v3, v3, v7, v11 ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-UNSAFE-LABEL: test_4xfloat_add_mul: -; GFX9-UNSAFE: ; %bb.0: ; %.entry -; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v4, v8 -; GFX9-UNSAFE-NEXT: v_fma_f32 v1, v1, v5, v9 -; GFX9-UNSAFE-NEXT: v_fma_f32 v2, v2, v6, v10 -; GFX9-UNSAFE-NEXT: v_fma_f32 v3, v3, v7, v11 -; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: test_4xfloat_add_mul: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -577,8 +885,75 @@ define <4 x float> @test_4xfloat_add_mul(<4 x float> %x, <4 x float> %y, <4 x fl ; GFX10-DENORM-NEXT: v_mad_f32 v2, v2, v6, v10 ; GFX10-DENORM-NEXT: v_mad_f32 v3, v3, v7, v11 ; GFX10-DENORM-NEXT: s_setpc_b64 
s[30:31] +.entry: + %a = fmul <4 x float> %x, %y + %b = fadd <4 x float> %a, %z + ret <4 x float> %b +} + +define <4 x float> @test_4xfloat_add_mul_contract(<4 x float> %x, <4 x float> %y, <4 x float> %z) { +; GFX9-LABEL: test_4xfloat_add_mul_contract: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f32 v0, v0, v4, v8 +; GFX9-NEXT: v_fma_f32 v1, v1, v5, v9 +; GFX9-NEXT: v_fma_f32 v2, v2, v6, v10 +; GFX9-NEXT: v_fma_f32 v3, v3, v7, v11 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_4xfloat_add_mul_contract: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v4, v8 +; GFX9-CONTRACT-NEXT: v_fma_f32 v1, v1, v5, v9 +; GFX9-CONTRACT-NEXT: v_fma_f32 v2, v2, v6, v10 +; GFX9-CONTRACT-NEXT: v_fma_f32 v3, v3, v7, v11 +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_4xfloat_add_mul_contract: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v4, v8 +; GFX9-DENORM-NEXT: v_mad_f32 v1, v1, v5, v9 +; GFX9-DENORM-NEXT: v_mad_f32 v2, v2, v6, v10 +; GFX9-DENORM-NEXT: v_mad_f32 v3, v3, v7, v11 +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-UNSAFE-LABEL: test_4xfloat_add_mul: +; GFX10-LABEL: test_4xfloat_add_mul_contract: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_fma_f32 v0, v0, v4, v8 +; GFX10-NEXT: v_fma_f32 v1, v1, v5, v9 +; GFX10-NEXT: v_fma_f32 v2, v2, v6, v10 +; GFX10-NEXT: v_fma_f32 v3, v3, v7, v11 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_4xfloat_add_mul_contract: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: v_fma_f32 v0, v0, v4, v8 +; GFX10-CONTRACT-NEXT: v_fma_f32 v1, v1, v5, v9 +; GFX10-CONTRACT-NEXT: v_fma_f32 v2, v2, v6, v10 +; GFX10-CONTRACT-NEXT: v_fma_f32 v3, v3, v7, v11 +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_4xfloat_add_mul_contract: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: v_fma_f32 v0, v0, v4, v8 +; GFX10-DENORM-NEXT: v_fma_f32 v1, v1, v5, v9 +; GFX10-DENORM-NEXT: v_fma_f32 v2, v2, v6, v10 +; GFX10-DENORM-NEXT: v_fma_f32 v3, v3, v7, v11 +; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX9-UNSAFE-LABEL: test_4xfloat_add_mul_contract: +; GFX9-UNSAFE: ; %bb.0: ; %.entry +; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v4, v8 +; GFX9-UNSAFE-NEXT: v_fma_f32 v1, v1, v5, v9 +; GFX9-UNSAFE-NEXT: v_fma_f32 v2, v2, v6, v10 +; GFX9-UNSAFE-NEXT: v_fma_f32 v3, v3, v7, v11 +; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] +; GFX10-UNSAFE-LABEL: test_4xfloat_add_mul_contract: ; GFX10-UNSAFE: ; %bb.0: ; %.entry ; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-UNSAFE-NEXT: v_fma_f32 v0, v0, v4, v8 @@ -587,8 +962,8 @@ define <4 x float> @test_4xfloat_add_mul(<4 x float> %x, <4 x float> %y, <4 x fl ; GFX10-UNSAFE-NEXT: v_fma_f32 v3, v3, v7, v11 ; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31] .entry: - %a = fmul <4 x float> %x, %y - %b = fadd <4 x float> %a, %z + %a = fmul contract <4 x float> %x, %y + %b = fadd contract <4 x float> %a, %z ret <4 x float> %b } @@ -620,14 +995,6 @@ define <3 x float> @test_3xfloat_add_mul_rhs(<3 x float> %x, <3 x float> %y, <3 ; GFX9-DENORM-NEXT: v_mad_f32 v2, v2, v5, v8 ; 
GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-UNSAFE-LABEL: test_3xfloat_add_mul_rhs: -; GFX9-UNSAFE: ; %bb.0: ; %.entry -; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v3, v6 -; GFX9-UNSAFE-NEXT: v_fma_f32 v1, v1, v4, v7 -; GFX9-UNSAFE-NEXT: v_fma_f32 v2, v2, v5, v8 -; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: test_3xfloat_add_mul_rhs: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -654,8 +1021,68 @@ define <3 x float> @test_3xfloat_add_mul_rhs(<3 x float> %x, <3 x float> %y, <3 ; GFX10-DENORM-NEXT: v_mad_f32 v1, v1, v4, v7 ; GFX10-DENORM-NEXT: v_mad_f32 v2, v2, v5, v8 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +.entry: + %a = fmul <3 x float> %x, %y + %b = fadd <3 x float> %z, %a + ret <3 x float> %b +} + +define <3 x float> @test_3xfloat_add_mul_rhs_contract(<3 x float> %x, <3 x float> %y, <3 x float> %z) { +; GFX9-LABEL: test_3xfloat_add_mul_rhs_contract: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f32 v0, v0, v3, v6 +; GFX9-NEXT: v_fma_f32 v1, v1, v4, v7 +; GFX9-NEXT: v_fma_f32 v2, v2, v5, v8 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_3xfloat_add_mul_rhs_contract: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v3, v6 +; GFX9-CONTRACT-NEXT: v_fma_f32 v1, v1, v4, v7 +; GFX9-CONTRACT-NEXT: v_fma_f32 v2, v2, v5, v8 +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-UNSAFE-LABEL: test_3xfloat_add_mul_rhs: +; GFX9-DENORM-LABEL: test_3xfloat_add_mul_rhs_contract: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v3, v6 +; GFX9-DENORM-NEXT: v_mad_f32 v1, v1, v4, v7 +; GFX9-DENORM-NEXT: v_mad_f32 v2, v2, v5, v8 +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_3xfloat_add_mul_rhs_contract: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_fma_f32 v0, v0, v3, v6 +; GFX10-NEXT: v_fma_f32 v1, v1, v4, v7 +; GFX10-NEXT: v_fma_f32 v2, v2, v5, v8 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_3xfloat_add_mul_rhs_contract: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: v_fma_f32 v0, v0, v3, v6 +; GFX10-CONTRACT-NEXT: v_fma_f32 v1, v1, v4, v7 +; GFX10-CONTRACT-NEXT: v_fma_f32 v2, v2, v5, v8 +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_3xfloat_add_mul_rhs_contract: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: v_fma_f32 v0, v0, v3, v6 +; GFX10-DENORM-NEXT: v_fma_f32 v1, v1, v4, v7 +; GFX10-DENORM-NEXT: v_fma_f32 v2, v2, v5, v8 +; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX9-UNSAFE-LABEL: test_3xfloat_add_mul_rhs_contract: +; GFX9-UNSAFE: ; %bb.0: ; %.entry +; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-UNSAFE-NEXT: v_fma_f32 v0, v0, v3, v6 +; GFX9-UNSAFE-NEXT: v_fma_f32 v1, v1, v4, v7 +; GFX9-UNSAFE-NEXT: v_fma_f32 v2, v2, v5, v8 +; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] +; GFX10-UNSAFE-LABEL: test_3xfloat_add_mul_rhs_contract: ; GFX10-UNSAFE: ; %bb.0: ; %.entry ; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-UNSAFE-NEXT: v_fma_f32 v0, v0, v3, v6 @@ -663,8 +1090,8 @@ define <3 x float> @test_3xfloat_add_mul_rhs(<3 x float> 
%x, <3 x float> %y, <3 ; GFX10-UNSAFE-NEXT: v_fma_f32 v2, v2, v5, v8 ; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31] .entry: - %a = fmul <3 x float> %x, %y - %b = fadd <3 x float> %z, %a + %a = fmul contract <3 x float> %x, %y + %b = fadd contract <3 x float> %z, %a ret <3 x float> %b } @@ -694,13 +1121,6 @@ define <4 x half> @test_4xhalf_add_mul(<4 x half> %x, <4 x half> %y, <4 x half> ; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v1, v5 ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-UNSAFE-LABEL: test_4xhalf_add_mul: -; GFX9-UNSAFE: ; %bb.0: ; %.entry -; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5 -; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: test_4xhalf_add_mul: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -725,7 +1145,6 @@ define <4 x half> @test_4xhalf_add_mul(<4 x half> %x, <4 x half> %y, <4 x half> ; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v1, v5 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-UNSAFE-LABEL: test_4xhalf_add_mul: ; GFX10-UNSAFE: ; %bb.0: ; %.entry ; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -738,6 +1157,70 @@ define <4 x half> @test_4xhalf_add_mul(<4 x half> %x, <4 x half> %y, <4 x half> ret <4 x half> %b } +define <4 x half> @test_4xhalf_add_mul_contract(<4 x half> %x, <4 x half> %y, <4 x half> %z) { +; GFX9-LABEL: test_4xhalf_add_mul_contract: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_fma_f16 v0, v0, v2, v4 +; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_4xhalf_add_mul_contract: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4 +; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5 +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_4xhalf_add_mul_contract: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 +; GFX9-DENORM-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v1, v5 +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_4xhalf_add_mul_contract: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_fma_f16 v0, v0, v2, v4 +; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_4xhalf_add_mul_contract: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4 +; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5 +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_4xhalf_add_mul_contract: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 +; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v1, v5 +; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX9-UNSAFE-LABEL: test_4xhalf_add_mul_contract: +; GFX9-UNSAFE: ; %bb.0: ; %.entry +; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4 +; GFX9-UNSAFE-NEXT: 
v_pk_fma_f16 v1, v1, v3, v5 +; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] +; GFX10-UNSAFE-LABEL: test_4xhalf_add_mul_contract: +; GFX10-UNSAFE: ; %bb.0: ; %.entry +; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4 +; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5 +; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31] +.entry: + %a = fmul contract <4 x half> %x, %y + %b = fadd contract <4 x half> %a, %z + ret <4 x half> %b +} + define <3 x half> @test_3xhalf_add_mul_rhs(<3 x half> %x, <3 x half> %y, <3 x half> %z) { ; GFX9-LABEL: test_3xhalf_add_mul_rhs: ; GFX9: ; %bb.0: ; %.entry @@ -764,13 +1247,6 @@ define <3 x half> @test_3xhalf_add_mul_rhs(<3 x half> %x, <3 x half> %y, <3 x ha ; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v5, v1 ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-UNSAFE-LABEL: test_3xhalf_add_mul_rhs: -; GFX9-UNSAFE: ; %bb.0: ; %.entry -; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5 -; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: test_3xhalf_add_mul_rhs: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -795,16 +1271,73 @@ define <3 x half> @test_3xhalf_add_mul_rhs(<3 x half> %x, <3 x half> %y, <3 x ha ; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v4, v0 ; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v5, v1 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +.entry: + %a = fmul <3 x half> %x, %y + %b = fadd <3 x half> %z, %a + ret <3 x half> %b +} + +define <3 x half> @test_3xhalf_add_mul_rhs_contract(<3 x half> %x, <3 x half> %y, <3 x half> %z) { +; GFX9-LABEL: test_3xhalf_add_mul_rhs_contract: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_fma_f16 v0, v0, v2, v4 +; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_3xhalf_add_mul_rhs_contract: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4 +; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5 +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_3xhalf_add_mul_rhs_contract: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 +; GFX9-DENORM-NEXT: v_pk_add_f16 v0, v4, v0 +; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v5, v1 +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_3xhalf_add_mul_rhs_contract: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_fma_f16 v0, v0, v2, v4 +; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-UNSAFE-LABEL: test_3xhalf_add_mul_rhs: +; GFX10-CONTRACT-LABEL: test_3xhalf_add_mul_rhs_contract: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4 +; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5 +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_3xhalf_add_mul_rhs_contract: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 +; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v4, v0 +; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v5, v1 
+; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +; GFX9-UNSAFE-LABEL: test_3xhalf_add_mul_rhs_contract: +; GFX9-UNSAFE: ; %bb.0: ; %.entry +; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4 +; GFX9-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5 +; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] +; GFX10-UNSAFE-LABEL: test_3xhalf_add_mul_rhs_contract: ; GFX10-UNSAFE: ; %bb.0: ; %.entry ; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4 ; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v3, v5 ; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31] .entry: - %a = fmul <3 x half> %x, %y - %b = fadd <3 x half> %z, %a + %a = fmul contract <3 x half> %x, %y + %b = fadd contract <3 x half> %z, %a ret <3 x half> %b } @@ -844,15 +1377,6 @@ define <4 x double> @test_4xdouble_add_mul(<4 x double> %x, <4 x double> %y, <4 ; GFX9-DENORM-NEXT: v_add_f64 v[6:7], v[6:7], v[22:23] ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-UNSAFE-LABEL: test_4xdouble_add_mul: -; GFX9-UNSAFE: ; %bb.0: ; %.entry -; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] -; GFX9-UNSAFE-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] -; GFX9-UNSAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] -; GFX9-UNSAFE-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] -; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: test_4xdouble_add_mul: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -887,7 +1411,14 @@ define <4 x double> @test_4xdouble_add_mul(<4 x double> %x, <4 x double> %y, <4 ; GFX10-DENORM-NEXT: v_add_f64 v[4:5], v[4:5], v[20:21] ; GFX10-DENORM-NEXT: v_add_f64 v[6:7], v[6:7], v[22:23] ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] -; +; GFX9-UNSAFE-LABEL: test_4xdouble_add_mul: +; GFX9-UNSAFE: ; %bb.0: ; %.entry +; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] +; GFX9-UNSAFE-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] +; GFX9-UNSAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] +; GFX9-UNSAFE-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] +; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; GFX10-UNSAFE-LABEL: test_4xdouble_add_mul: ; GFX10-UNSAFE: ; %bb.0: ; %.entry ; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -902,6 +1433,66 @@ define <4 x double> @test_4xdouble_add_mul(<4 x double> %x, <4 x double> %y, <4 ret <4 x double> %b } +define <4 x double> @test_4xdouble_add_mul_contract(<4 x double> %x, <4 x double> %y, <4 x double> %z) { +; GFX9-LABEL: test_4xdouble_add_mul_contract: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] +; GFX9-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] +; GFX9-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] +; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_4xdouble_add_mul_contract: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] +; GFX9-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] +; GFX9-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] +; GFX9-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: 
test_4xdouble_add_mul_contract: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] +; GFX9-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] +; GFX9-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] +; GFX9-DENORM-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_4xdouble_add_mul_contract: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] +; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] +; GFX10-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_4xdouble_add_mul_contract: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] +; GFX10-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] +; GFX10-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] +; GFX10-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_4xdouble_add_mul_contract: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] +; GFX10-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] +; GFX10-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] +; GFX10-DENORM-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] +; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +.entry: + %a = fmul contract <4 x double> %x, %y + %b = fadd contract <4 x double> %a, %z + ret <4 x double> %b +} + define <3 x double> @test_3xdouble_add_mul_rhs(<3 x double> %x, <3 x double> %y, <3 x double> %z) { ; GFX9-LABEL: test_3xdouble_add_mul_rhs: ; GFX9: ; %bb.0: ; %.entry @@ -933,14 +1524,6 @@ define <3 x double> @test_3xdouble_add_mul_rhs(<3 x double> %x, <3 x double> %y, ; GFX9-DENORM-NEXT: v_add_f64 v[4:5], v[16:17], v[4:5] ; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-UNSAFE-LABEL: test_3xdouble_add_mul_rhs: -; GFX9-UNSAFE: ; %bb.0: ; %.entry -; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13] -; GFX9-UNSAFE-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15] -; GFX9-UNSAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17] -; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] -; ; GFX10-LABEL: test_3xdouble_add_mul_rhs: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -970,7 +1553,13 @@ define <3 x double> @test_3xdouble_add_mul_rhs(<3 x double> %x, <3 x double> %y, ; GFX10-DENORM-NEXT: v_add_f64 v[2:3], v[14:15], v[2:3] ; GFX10-DENORM-NEXT: v_add_f64 v[4:5], v[16:17], v[4:5] ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] -; +; GFX9-UNSAFE-LABEL: test_3xdouble_add_mul_rhs: +; GFX9-UNSAFE: ; %bb.0: ; %.entry +; GFX9-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13] +; GFX9-UNSAFE-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15] +; GFX9-UNSAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17] +; GFX9-UNSAFE-NEXT: s_setpc_b64 s[30:31] ; GFX10-UNSAFE-LABEL: test_3xdouble_add_mul_rhs: ; GFX10-UNSAFE: ; %bb.0: ; %.entry ; GFX10-UNSAFE-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) @@ -983,3 +1572,57 @@ define <3 x double> @test_3xdouble_add_mul_rhs(<3 x double> %x, <3 x double> %y, %b = fadd <3 x double> %z, %a ret <3 x double> %b } + +define <3 x double> @test_3xdouble_add_mul_rhs_contract(<3 x double> %x, <3 x double> %y, <3 x double> %z) { +; GFX9-LABEL: test_3xdouble_add_mul_rhs_contract: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13] +; GFX9-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15] +; GFX9-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_3xdouble_add_mul_rhs_contract: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13] +; GFX9-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15] +; GFX9-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17] +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_3xdouble_add_mul_rhs_contract: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13] +; GFX9-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15] +; GFX9-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17] +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_3xdouble_add_mul_rhs_contract: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13] +; GFX10-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15] +; GFX10-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_3xdouble_add_mul_rhs_contract: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13] +; GFX10-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15] +; GFX10-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17] +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_3xdouble_add_mul_rhs_contract: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13] +; GFX10-DENORM-NEXT: v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15] +; GFX10-DENORM-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17] +; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +.entry: + %a = fmul contract <3 x double> %x, %y + %b = fadd contract <3 x double> %z, %a + ret <3 x double> %b +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir index 2845a63..d9ac9a7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-unmerge-values.mir @@ -24,8 +24,8 @@ body: | %ptr:_(p1) = COPY $vgpr2_vgpr3 %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1) %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>) - %6:_(s32) = G_FMUL %0, %1 - %7:_(s32) = G_FADD %6, %el1 + %6:_(s32) = contract G_FMUL %0, %1 + %7:_(s32) = contract G_FADD %6, %el1 $vgpr0 = COPY %7(s32) ... 
@@ -54,8 +54,8 @@ body: | %ptr:_(p1) = COPY $vgpr2_vgpr3 %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1) %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>) - %6:_(s32) = G_FMUL %0, %1 - %7:_(s32) = G_FADD %el1, %6 + %6:_(s32) = contract G_FMUL %0, %1 + %7:_(s32) = contract G_FADD %el1, %6 $vgpr0 = COPY %7(s32) ... @@ -233,10 +233,10 @@ body: | %7:_(s16) = G_TRUNC %6(s32) %8:_(s32) = COPY $vgpr5 %9:_(s16) = G_TRUNC %8(s32) - %10:_(s16) = G_FMUL %7, %9 + %10:_(s16) = contract G_FMUL %7, %9 %11:_(s32) = G_FPEXT %10(s16) %12:_(s32) = G_FMA %0, %1, %11 - %13:_(s32) = G_FADD %12, %el1 + %13:_(s32) = contract G_FADD %12, %el1 $vgpr0 = COPY %13(s32) ... @@ -282,11 +282,11 @@ body: | %9:_(s16) = G_TRUNC %8(s32) %10:_(s32) = COPY $vgpr5 %11:_(s16) = G_TRUNC %10(s32) - %12:_(s16) = G_FMUL %9, %11 - %13:_(s16) = G_FMUL %1, %3 - %14:_(s16) = G_FADD %13, %12 + %12:_(s16) = contract G_FMUL %9, %11 + %13:_(s16) = contract G_FMUL %1, %3 + %14:_(s16) = contract G_FADD %13, %12 %15:_(s32) = G_FPEXT %14(s16) - %16:_(s32) = G_FADD %15, %el1 + %16:_(s32) = contract G_FADD %15, %el1 $vgpr0 = COPY %16(s32) ... @@ -326,10 +326,10 @@ body: | %7:_(s16) = G_TRUNC %6(s32) %8:_(s32) = COPY $vgpr5 %9:_(s16) = G_TRUNC %8(s32) - %10:_(s16) = G_FMUL %7, %9 + %10:_(s16) = contract G_FMUL %7, %9 %11:_(s32) = G_FPEXT %10(s16) %12:_(s32) = G_FMA %4, %5, %11 - %13:_(s32) = G_FADD %el1, %12 + %13:_(s32) = contract G_FADD %el1, %12 $vgpr0 = COPY %13(s32) ... @@ -375,11 +375,11 @@ body: | %9:_(s16) = G_TRUNC %8(s32) %10:_(s32) = COPY $vgpr5 %11:_(s16) = G_TRUNC %10(s32) - %12:_(s16) = G_FMUL %9, %11 - %13:_(s16) = G_FMUL %5, %7 - %14:_(s16) = G_FADD %13, %12 + %12:_(s16) = contract G_FMUL %9, %11 + %13:_(s16) = contract G_FMUL %5, %7 + %14:_(s16) = contract G_FADD %13, %12 %15:_(s32) = G_FPEXT %14(s16) - %16:_(s32) = G_FADD %el1, %15 + %16:_(s32) = contract G_FADD %el1, %15 $vgpr0 = COPY %16(s32) ... @@ -409,8 +409,8 @@ body: | %ptr:_(p1) = COPY $vgpr0_vgpr1 %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1) %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>) - %6:_(s32) = G_FMUL %0, %1 - %7:_(s32) = G_FSUB %6, %el1 + %6:_(s32) = contract G_FMUL %0, %1 + %7:_(s32) = contract G_FSUB %6, %el1 $vgpr0 = COPY %7(s32) ... @@ -440,7 +440,7 @@ body: | %ptr:_(p1) = COPY $vgpr2_vgpr3 %vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1) %el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>) - %6:_(s32) = G_FMUL %0, %1 - %7:_(s32) = G_FSUB %el1, %6 + %6:_(s32) = contract G_FMUL %0, %1 + %7:_(s32) = contract G_FSUB %el1, %6 $vgpr0 = COPY %7(s32) ... 
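The MIR changes above replace plain G_FMUL/G_FADD pairs with `contract`-flagged ones: the tests now key the mul+add-to-fma fold on the per-instruction `contract` fast-math flag instead of the removed `-enable-unsafe-fp-math` runs and their GFX9-UNSAFE/GFX10-UNSAFE prefixes. A minimal IR sketch of the pattern these tests exercise (function name illustrative, not taken from the patch):

define float @fma_candidate(float %x, float %y, float %z) {
  ; Both operations carry the contract flag, so the backend is free to
  ; fuse the multiply and the dependent add into a single fma
  ; (e.g. v_fma_f32 on GFX9/GFX10).
  %mul = fmul contract float %x, %y
  %add = fadd contract float %mul, %z
  ret float %add
}

Without `contract` on both instructions the fold is not permitted, and the multiply and add are selected separately.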
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll index c4d57ac..da25ac0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll @@ -12,7 +12,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX942-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr) + ; GFX942-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, !noalias.addrspace !0) ; GFX942-NEXT: S_ENDPGM 0 ; ; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic @@ -23,7 +23,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(ptr %ptr, float %da ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr) + ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, !noalias.addrspace !0) ; GFX11-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr %ptr, float %data) ret void @@ -38,7 +38,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data ; GFX942-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX942-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX942-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr) + ; GFX942-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, !noalias.addrspace !0) ; GFX942-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] ; GFX942-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; @@ -50,7 +50,7 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(ptr %ptr, float %data ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr) + ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, !noalias.addrspace !0) ; GFX11-NEXT: $vgpr0 = COPY 
[[FLAT_ATOMIC_ADD_F32_RTN]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr %ptr, float %data) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll index c82ae2fb..bf36979 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll @@ -13,7 +13,7 @@ define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %d ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX942-NEXT: FLAT_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr) + ; GFX90A_GFX942-NEXT: FLAT_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, !noalias.addrspace !0) ; GFX90A_GFX942-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 ret void @@ -30,7 +30,7 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %da ; GFX90A_GFX942-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX90A_GFX942-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; GFX90A_GFX942-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 - ; GFX90A_GFX942-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr) + ; GFX90A_GFX942-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr, !noalias.addrspace !0) ; GFX90A_GFX942-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0 ; GFX90A_GFX942-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1 ; GFX90A_GFX942-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir index f513de8..477ef32 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fptrunc.mir @@ -385,117 +385,16 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) - ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64) - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C]](s32) - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2047 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]] - ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1008 - ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AND]], [[C2]] - ; CHECK-NEXT: 
[[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C3]](s32) - ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4094 - ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C4]] - ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 511 - ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV3]], [[C5]] - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[UV2]] - ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR]](s32), [[C6]] - ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1) - ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[ZEXT]] - ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 512 - ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR1]](s32), [[C6]] - ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[C7]], [[C6]] - ; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 31744 - ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SELECT]], [[C8]] - ; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ADD]], [[C9]](s32) - ; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL]] - ; CHECK-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C10]], [[ADD]] - ; CHECK-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[C6]] - ; CHECK-NEXT: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 13 - ; CHECK-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SMAX]], [[C11]] - ; CHECK-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 4096 - ; CHECK-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[C12]] - ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[OR4]], [[SMIN]](s32) - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LSHR2]], [[SMIN]](s32) - ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SHL1]](s32), [[OR4]] - ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP2]](s1) - ; CHECK-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[LSHR2]], [[ZEXT1]] - ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[ADD]](s32), [[C10]] - ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[OR5]], [[OR3]] - ; CHECK-NEXT: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 - ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[SELECT1]], [[C13]] - ; CHECK-NEXT: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[SELECT1]], [[C14]](s32) - ; CHECK-NEXT: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 - ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND3]](s32), [[C15]] - ; CHECK-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP4]](s1) - ; CHECK-NEXT: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 - ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND3]](s32), [[C16]] - ; CHECK-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP5]](s1) - ; CHECK-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[ZEXT3]] - ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LSHR3]], [[OR6]] - ; CHECK-NEXT: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 30 - ; CHECK-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[ADD]](s32), [[C17]] - ; CHECK-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP6]](s1), [[C8]], [[ADD1]] - ; CHECK-NEXT: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 1039 - ; CHECK-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C18]] - ; CHECK-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP7]](s1), [[OR2]], [[SELECT2]] - ; CHECK-NEXT: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: 
[[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C19]](s32) - ; CHECK-NEXT: [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 32768 - ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR4]], [[C20]] - ; CHECK-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SELECT3]] - ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) - ; CHECK-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C]](s32) - ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[LSHR5]], [[C1]] - ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[AND5]], [[C2]] - ; CHECK-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C3]](s32) - ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[LSHR6]], [[C4]] - ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C5]] - ; CHECK-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[AND7]], [[UV4]] - ; CHECK-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR8]](s32), [[C6]] - ; CHECK-NEXT: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP8]](s1) - ; CHECK-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[ZEXT4]] - ; CHECK-NEXT: [[ICMP9:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR9]](s32), [[C6]] - ; CHECK-NEXT: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[ICMP9]](s1), [[C7]], [[C6]] - ; CHECK-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SELECT4]], [[C8]] - ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ADD2]], [[C9]](s32) - ; CHECK-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL2]] - ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C10]], [[ADD2]] - ; CHECK-NEXT: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[C6]] - ; CHECK-NEXT: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[C11]] - ; CHECK-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[C12]] - ; CHECK-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[OR12]], [[SMIN1]](s32) - ; CHECK-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LSHR7]], [[SMIN1]](s32) - ; CHECK-NEXT: [[ICMP10:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SHL3]](s32), [[OR12]] - ; CHECK-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP10]](s1) - ; CHECK-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[LSHR7]], [[ZEXT5]] - ; CHECK-NEXT: [[ICMP11:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[ADD2]](s32), [[C10]] - ; CHECK-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP11]](s1), [[OR13]], [[OR11]] - ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[SELECT5]], [[C13]] - ; CHECK-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[SELECT5]], [[C14]](s32) - ; CHECK-NEXT: [[ICMP12:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND8]](s32), [[C15]] - ; CHECK-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP12]](s1) - ; CHECK-NEXT: [[ICMP13:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND8]](s32), [[C16]] - ; CHECK-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP13]](s1) - ; CHECK-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[ZEXT7]] - ; CHECK-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[LSHR8]], [[OR14]] - ; CHECK-NEXT: [[ICMP14:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[ADD2]](s32), [[C17]] - ; CHECK-NEXT: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[ICMP14]](s1), [[C8]], [[ADD3]] - ; CHECK-NEXT: [[ICMP15:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ADD2]](s32), [[C18]] - ; CHECK-NEXT: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[ICMP15]](s1), [[OR10]], [[SELECT6]] - ; CHECK-NEXT: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C19]](s32) - ; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[LSHR9]], [[C20]] - ; CHECK-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[AND9]], [[SELECT7]] - ; CHECK-NEXT: [[C21:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; CHECK-NEXT: [[AND10:%[0-9]+]]:_(s32) = G_AND [[OR7]], [[C21]] - ; CHECK-NEXT: [[AND11:%[0-9]+]]:_(s32) = G_AND [[OR15]], [[C21]] - ; 
CHECK-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C19]](s32) - ; CHECK-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL4]] - ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR16]](s32) + ; CHECK-NEXT: [[FPTRUNC:%[0-9]+]]:_(s32) = afn G_FPTRUNC [[UV]](s64) + ; CHECK-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = afn G_FPTRUNC [[FPTRUNC]](s32) + ; CHECK-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s32) = afn G_FPTRUNC [[UV1]](s64) + ; CHECK-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = afn G_FPTRUNC [[FPTRUNC2]](s32) + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16) + ; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC3]](s16) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]] + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32) ; CHECK-NEXT: $vgpr0 = COPY [[BITCAST]](<2 x s16>) %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(<2 x s16>) = afn G_FPTRUNC %0 diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll index d0b41e1..57b4857 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -global-isel=0 -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=SI-SDAG %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -global-isel=1 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI-GISEL %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -global-isel=0 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=VI-SDAG %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -global-isel=1 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=VI-GISEL %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-SDAG %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-GISEL %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-SDAG %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-GISEL %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SDAG-TRUE16 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SDAG-FAKE16 %s 
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-TRUE16 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,-real-true16 -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-FAKE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -global-isel=0 < %s | FileCheck -enable-var-scope -check-prefixes=SI-SDAG %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -global-isel=1 < %s | FileCheck -check-prefixes=SI-GISEL %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=VI-SDAG %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=VI-GISEL %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-SDAG %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-GISEL %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-SDAG %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-GISEL %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SDAG-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SDAG-FAKE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,-real-true16 -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-FAKE16 %s define amdgpu_kernel void @fptrunc_f32_to_f16( ; SI-SDAG-LABEL: fptrunc_f32_to_f16: @@ -201,8 +201,8 @@ entry: ret void } -define amdgpu_kernel void @fptrunc_f64_to_f16( -; SI-SDAG-LABEL: fptrunc_f64_to_f16: +define amdgpu_kernel void @fptrunc_f32_to_f16_afn(ptr addrspace(1) %r, +; SI-SDAG-LABEL: fptrunc_f32_to_f16_afn: ; SI-SDAG: ; %bb.0: ; %entry ; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 @@ -212,29 +212,27 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SI-SDAG-NEXT: s_mov_b32 s8, s2 ; SI-SDAG-NEXT: s_mov_b32 s9, s3 -; 
SI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-SDAG-NEXT: s_mov_b32 s4, s0 ; SI-SDAG-NEXT: s_mov_b32 s5, s1 ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; -; SI-GISEL-LABEL: fptrunc_f64_to_f16: +; SI-GISEL-LABEL: fptrunc_f32_to_f16_afn: ; SI-GISEL: ; %bb.0: ; %entry ; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; SI-GISEL-NEXT: s_load_dword s3, s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; SI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s3 ; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; SI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; -; VI-SDAG-LABEL: fptrunc_f64_to_f16: +; VI-SDAG-LABEL: fptrunc_f32_to_f16_afn: ; VI-SDAG: ; %bb.0: ; %entry ; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 @@ -244,29 +242,27 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; VI-SDAG-NEXT: s_mov_b32 s8, s2 ; VI-SDAG-NEXT: s_mov_b32 s9, s3 -; VI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; VI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; VI-SDAG-NEXT: s_mov_b32 s4, s0 ; VI-SDAG-NEXT: s_mov_b32 s5, s1 ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) -; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; VI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-SDAG-NEXT: s_endpgm ; -; VI-GISEL-LABEL: fptrunc_f64_to_f16: +; VI-GISEL-LABEL: fptrunc_f32_to_f16_afn: ; VI-GISEL: ; %bb.0: ; %entry ; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 +; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2 ; VI-GISEL-NEXT: s_mov_b32 s2, -1 -; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 -; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: fptrunc_f64_to_f16: +; GFX9-SDAG-LABEL: fptrunc_f32_to_f16_afn: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 @@ -276,29 +272,27 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-SDAG-NEXT: s_mov_b32 s8, s2 ; GFX9-SDAG-NEXT: s_mov_b32 s9, s3 -; GFX9-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX9-SDAG-NEXT: s_mov_b32 s4, s0 ; GFX9-SDAG-NEXT: s_mov_b32 s5, s1 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-SDAG-NEXT: s_endpgm ; -; GFX9-GISEL-LABEL: fptrunc_f64_to_f16: +; GFX9-GISEL-LABEL: fptrunc_f32_to_f16_afn: ; GFX9-GISEL: ; %bb.0: ; %entry ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 +; 
GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2 ; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 -; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-GISEL-NEXT: s_endpgm ; -; GFX950-SDAG-LABEL: fptrunc_f64_to_f16: +; GFX950-SDAG-LABEL: fptrunc_f32_to_f16_afn: ; GFX950-SDAG: ; %bb.0: ; %entry ; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX950-SDAG-NEXT: s_mov_b32 s7, 0xf000 @@ -308,23 +302,541 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_mov_b32 s8, s2 ; GFX950-SDAG-NEXT: s_mov_b32 s9, s3 -; GFX950-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX950-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; GFX950-SDAG-NEXT: s_mov_b32 s4, s0 ; GFX950-SDAG-NEXT: s_mov_b32 s5, s1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX950-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX950-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX950-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX950-SDAG-NEXT: s_endpgm ; +; GFX950-GISEL-LABEL: fptrunc_f32_to_f16_afn: +; GFX950-GISEL: ; %bb.0: ; %entry +; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2 +; GFX950-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX950-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX950-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-TRUE16-LABEL: fptrunc_f32_to_f16_afn: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-SDAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX11-SDAG-FAKE16-LABEL: fptrunc_f32_to_f16_afn: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-SDAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-SDAG-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-TRUE16-LABEL: fptrunc_f32_to_f16_afn: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s2 +; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-GISEL-TRUE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: fptrunc_f32_to_f16_afn: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, s2 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-GISEL-FAKE16-NEXT: s_endpgm + ptr addrspace(1) %a) { +entry: + %a.val = load float, ptr addrspace(1) %a + %r.val = fptrunc afn float %a.val to half + store half %r.val, ptr addrspace(1) %r + ret void +} + +define amdgpu_kernel void @fptrunc_f64_to_f16( +; SI-SDAG-LABEL: fptrunc_f64_to_f16: +; SI-SDAG: ; %bb.0: ; %entry +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: s_mov_b32 s10, s2 +; SI-SDAG-NEXT: s_mov_b32 s11, s3 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s8, s6 +; SI-SDAG-NEXT: s_mov_b32 s9, s7 +; SI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-SDAG-NEXT: s_movk_i32 s0, 0x7e00 +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; SI-SDAG-NEXT: s_and_b32 s6, s1, 0x1ff +; SI-SDAG-NEXT: s_lshr_b32 s7, s1, 8 +; SI-SDAG-NEXT: s_bfe_u32 s8, s1, 0xb0014 +; SI-SDAG-NEXT: v_or_b32_e32 v0, s6, v0 +; SI-SDAG-NEXT: s_and_b32 s6, s7, 0xffe +; SI-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8 +; SI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13 +; SI-SDAG-NEXT: v_readfirstlane_b32 s7, v0 +; SI-SDAG-NEXT: v_readfirstlane_b32 s9, v1 +; SI-SDAG-NEXT: s_or_b32 s6, s6, s7 +; SI-SDAG-NEXT: s_or_b32 s7, s6, 0x1000 +; SI-SDAG-NEXT: s_lshr_b32 s10, s7, s9 +; SI-SDAG-NEXT: s_lshl_b32 s9, s10, s9 +; SI-SDAG-NEXT: s_cmp_lg_u32 s9, s7 +; SI-SDAG-NEXT: s_cselect_b32 s7, 1, 0 +; SI-SDAG-NEXT: s_addk_i32 s8, 0xfc10 +; SI-SDAG-NEXT: s_or_b32 s7, s10, s7 +; SI-SDAG-NEXT: s_lshl_b32 s9, s8, 12 +; SI-SDAG-NEXT: s_or_b32 s9, s6, s9 +; SI-SDAG-NEXT: s_cmp_lt_i32 s8, 1 +; SI-SDAG-NEXT: s_cselect_b32 s7, s7, s9 +; SI-SDAG-NEXT: s_and_b32 s9, s7, 7 +; SI-SDAG-NEXT: s_cmp_gt_i32 s9, 5 +; SI-SDAG-NEXT: s_cselect_b32 s10, 1, 0 +; SI-SDAG-NEXT: s_cmp_eq_u32 s9, 3 +; SI-SDAG-NEXT: s_cselect_b32 s9, 1, 0 +; SI-SDAG-NEXT: s_lshr_b32 s7, s7, 2 +; SI-SDAG-NEXT: s_or_b32 s9, s9, s10 +; SI-SDAG-NEXT: s_add_i32 s7, s7, s9 +; SI-SDAG-NEXT: s_cmp_lt_i32 s8, 31 +; SI-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00 +; SI-SDAG-NEXT: s_cmp_lg_u32 s6, 0 +; SI-SDAG-NEXT: s_cselect_b32 s0, s0, 0x7c00 +; SI-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f +; SI-SDAG-NEXT: s_cselect_b32 s0, s0, s7 +; SI-SDAG-NEXT: s_lshr_b32 s1, s1, 16 +; SI-SDAG-NEXT: s_and_b32 s1, s1, 0x8000 +; SI-SDAG-NEXT: s_or_b32 s6, s1, s0 +; SI-SDAG-NEXT: s_mov_b32 s0, s4 +; SI-SDAG-NEXT: s_mov_b32 s1, s5 +; SI-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; SI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: fptrunc_f64_to_f16: +; SI-GISEL: ; 
%bb.0: ; %entry +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_bfe_u32 s3, s5, 0xb0014 +; SI-GISEL-NEXT: s_lshr_b32 s6, s5, 8 +; SI-GISEL-NEXT: s_and_b32 s7, s5, 0x1ff +; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10 +; SI-GISEL-NEXT: s_and_b32 s6, s6, 0xffe +; SI-GISEL-NEXT: s_or_b32 s4, s7, s4 +; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 +; SI-GISEL-NEXT: s_or_b32 s4, s6, s4 +; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0 +; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9 +; SI-GISEL-NEXT: s_lshl_b32 s7, s3, 12 +; SI-GISEL-NEXT: s_sub_i32 s8, 1, s3 +; SI-GISEL-NEXT: s_or_b32 s9, s4, 0x1000 +; SI-GISEL-NEXT: s_or_b32 s6, s6, 0x7c00 +; SI-GISEL-NEXT: s_or_b32 s4, s4, s7 +; SI-GISEL-NEXT: s_max_i32 s7, s8, 0 +; SI-GISEL-NEXT: s_min_i32 s7, s7, 13 +; SI-GISEL-NEXT: s_lshr_b32 s8, s9, s7 +; SI-GISEL-NEXT: s_lshl_b32 s7, s8, s7 +; SI-GISEL-NEXT: s_cmp_lg_u32 s7, s9 +; SI-GISEL-NEXT: s_cselect_b32 s7, 1, 0 +; SI-GISEL-NEXT: s_or_b32 s7, s8, s7 +; SI-GISEL-NEXT: s_cmp_lt_i32 s3, 1 +; SI-GISEL-NEXT: s_cselect_b32 s4, s7, s4 +; SI-GISEL-NEXT: s_and_b32 s7, s4, 7 +; SI-GISEL-NEXT: s_lshr_b32 s4, s4, 2 +; SI-GISEL-NEXT: s_cmp_eq_u32 s7, 3 +; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0 +; SI-GISEL-NEXT: s_cmp_gt_i32 s7, 5 +; SI-GISEL-NEXT: s_cselect_b32 s7, 1, 0 +; SI-GISEL-NEXT: s_or_b32 s7, s8, s7 +; SI-GISEL-NEXT: s_add_i32 s4, s4, s7 +; SI-GISEL-NEXT: s_cmp_gt_i32 s3, 30 +; SI-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4 +; SI-GISEL-NEXT: s_cmpk_eq_i32 s3, 0x40f +; SI-GISEL-NEXT: s_cselect_b32 s3, s6, s4 +; SI-GISEL-NEXT: s_lshr_b32 s4, s5, 16 +; SI-GISEL-NEXT: s_and_b32 s4, s4, 0x8000 +; SI-GISEL-NEXT: s_or_b32 s4, s4, s3 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-GISEL-NEXT: s_endpgm +; +; VI-SDAG-LABEL: fptrunc_f64_to_f16: +; VI-SDAG: ; %bb.0: ; %entry +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 +; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; VI-SDAG-NEXT: s_mov_b32 s2, -1 +; VI-SDAG-NEXT: s_mov_b32 s10, s2 +; VI-SDAG-NEXT: s_mov_b32 s11, s3 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s8, s6 +; VI-SDAG-NEXT: s_mov_b32 s9, s7 +; VI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; VI-SDAG-NEXT: s_mov_b32 s0, s4 +; VI-SDAG-NEXT: s_mov_b32 s1, s5 +; VI-SDAG-NEXT: s_movk_i32 s6, 0x7e00 +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_readfirstlane_b32 s4, v1 +; VI-SDAG-NEXT: s_and_b32 s5, s4, 0x1ff +; VI-SDAG-NEXT: v_or_b32_e32 v0, s5, v0 +; VI-SDAG-NEXT: s_lshr_b32 s7, s4, 8 +; VI-SDAG-NEXT: s_bfe_u32 s8, s4, 0xb0014 +; VI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-SDAG-NEXT: s_and_b32 s5, s7, 0xffe +; VI-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13 +; VI-SDAG-NEXT: v_readfirstlane_b32 s7, v0 +; VI-SDAG-NEXT: s_or_b32 s5, s5, s7 +; VI-SDAG-NEXT: v_readfirstlane_b32 s9, v1 +; VI-SDAG-NEXT: s_or_b32 s7, s5, 0x1000 +; VI-SDAG-NEXT: s_lshr_b32 s10, s7, s9 +; VI-SDAG-NEXT: s_lshl_b32 s9, s10, s9 +; VI-SDAG-NEXT: s_cmp_lg_u32 s9, s7 +; VI-SDAG-NEXT: s_cselect_b32 s7, 1, 0 +; VI-SDAG-NEXT: s_addk_i32 s8, 0xfc10 +; VI-SDAG-NEXT: s_lshl_b32 s9, s8, 12 +; VI-SDAG-NEXT: s_or_b32 s7, s10, s7 +; VI-SDAG-NEXT: s_or_b32 s9, s5, s9 +; VI-SDAG-NEXT: s_cmp_lt_i32 s8, 1 +; 
VI-SDAG-NEXT: s_cselect_b32 s7, s7, s9 +; VI-SDAG-NEXT: s_and_b32 s9, s7, 7 +; VI-SDAG-NEXT: s_cmp_gt_i32 s9, 5 +; VI-SDAG-NEXT: s_cselect_b32 s10, 1, 0 +; VI-SDAG-NEXT: s_cmp_eq_u32 s9, 3 +; VI-SDAG-NEXT: s_cselect_b32 s9, 1, 0 +; VI-SDAG-NEXT: s_lshr_b32 s7, s7, 2 +; VI-SDAG-NEXT: s_or_b32 s9, s9, s10 +; VI-SDAG-NEXT: s_add_i32 s7, s7, s9 +; VI-SDAG-NEXT: s_cmp_lt_i32 s8, 31 +; VI-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00 +; VI-SDAG-NEXT: s_cmp_lg_u32 s5, 0 +; VI-SDAG-NEXT: s_cselect_b32 s5, s6, 0x7c00 +; VI-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f +; VI-SDAG-NEXT: s_cselect_b32 s5, s5, s7 +; VI-SDAG-NEXT: s_lshr_b32 s4, s4, 16 +; VI-SDAG-NEXT: s_and_b32 s4, s4, 0x8000 +; VI-SDAG-NEXT: s_or_b32 s4, s4, s5 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; VI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: fptrunc_f64_to_f16: +; VI-GISEL: ; %bb.0: ; %entry +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 +; VI-GISEL-NEXT: s_lshr_b32 s5, s3, 8 +; VI-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff +; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10 +; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe +; VI-GISEL-NEXT: s_or_b32 s2, s6, s2 +; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 +; VI-GISEL-NEXT: s_or_b32 s2, s5, s2 +; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0 +; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4 +; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12 +; VI-GISEL-NEXT: s_max_i32 s7, s7, 0 +; VI-GISEL-NEXT: s_or_b32 s6, s2, s6 +; VI-GISEL-NEXT: s_min_i32 s7, s7, 13 +; VI-GISEL-NEXT: s_bitset1_b32 s2, 12 +; VI-GISEL-NEXT: s_lshl_b32 s5, s5, 9 +; VI-GISEL-NEXT: s_lshr_b32 s8, s2, s7 +; VI-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00 +; VI-GISEL-NEXT: s_lshl_b32 s7, s8, s7 +; VI-GISEL-NEXT: s_cmp_lg_u32 s7, s2 +; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 +; VI-GISEL-NEXT: s_or_b32 s2, s8, s2 +; VI-GISEL-NEXT: s_cmp_lt_i32 s4, 1 +; VI-GISEL-NEXT: s_cselect_b32 s2, s2, s6 +; VI-GISEL-NEXT: s_and_b32 s6, s2, 7 +; VI-GISEL-NEXT: s_lshr_b32 s2, s2, 2 +; VI-GISEL-NEXT: s_cmp_eq_u32 s6, 3 +; VI-GISEL-NEXT: s_cselect_b32 s7, 1, 0 +; VI-GISEL-NEXT: s_cmp_gt_i32 s6, 5 +; VI-GISEL-NEXT: s_cselect_b32 s6, 1, 0 +; VI-GISEL-NEXT: s_or_b32 s6, s7, s6 +; VI-GISEL-NEXT: s_add_i32 s2, s2, s6 +; VI-GISEL-NEXT: s_cmp_gt_i32 s4, 30 +; VI-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2 +; VI-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f +; VI-GISEL-NEXT: s_cselect_b32 s2, s5, s2 +; VI-GISEL-NEXT: s_lshr_b32 s3, s3, 16 +; VI-GISEL-NEXT: s_and_b32 s3, s3, 0x8000 +; VI-GISEL-NEXT: s_or_b32 s2, s3, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: s_mov_b32 s2, -1 +; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-GISEL-NEXT: s_endpgm +; +; GFX9-SDAG-LABEL: fptrunc_f64_to_f16: +; GFX9-SDAG: ; %bb.0: ; %entry +; GFX9-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s6, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s7, s3 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: s_mov_b32 s4, s10 +; GFX9-SDAG-NEXT: s_mov_b32 s5, s11 +; GFX9-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-SDAG-NEXT: s_mov_b32 s0, s8 +; GFX9-SDAG-NEXT: s_mov_b32 s1, s9 +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7e00 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: 
v_readfirstlane_b32 s5, v1 +; GFX9-SDAG-NEXT: s_and_b32 s6, s5, 0x1ff +; GFX9-SDAG-NEXT: v_or_b32_e32 v0, s6, v0 +; GFX9-SDAG-NEXT: s_lshr_b32 s7, s5, 8 +; GFX9-SDAG-NEXT: s_bfe_u32 s8, s5, 0xb0014 +; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-SDAG-NEXT: s_and_b32 s6, s7, 0xffe +; GFX9-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-SDAG-NEXT: s_or_b32 s6, s6, s7 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s9, v1 +; GFX9-SDAG-NEXT: s_or_b32 s7, s6, 0x1000 +; GFX9-SDAG-NEXT: s_lshr_b32 s10, s7, s9 +; GFX9-SDAG-NEXT: s_lshl_b32 s9, s10, s9 +; GFX9-SDAG-NEXT: s_cmp_lg_u32 s9, s7 +; GFX9-SDAG-NEXT: s_cselect_b32 s7, 1, 0 +; GFX9-SDAG-NEXT: s_addk_i32 s8, 0xfc10 +; GFX9-SDAG-NEXT: s_lshl_b32 s9, s8, 12 +; GFX9-SDAG-NEXT: s_or_b32 s7, s10, s7 +; GFX9-SDAG-NEXT: s_or_b32 s9, s6, s9 +; GFX9-SDAG-NEXT: s_cmp_lt_i32 s8, 1 +; GFX9-SDAG-NEXT: s_cselect_b32 s7, s7, s9 +; GFX9-SDAG-NEXT: s_and_b32 s9, s7, 7 +; GFX9-SDAG-NEXT: s_cmp_gt_i32 s9, 5 +; GFX9-SDAG-NEXT: s_cselect_b32 s10, 1, 0 +; GFX9-SDAG-NEXT: s_cmp_eq_u32 s9, 3 +; GFX9-SDAG-NEXT: s_cselect_b32 s9, 1, 0 +; GFX9-SDAG-NEXT: s_lshr_b32 s7, s7, 2 +; GFX9-SDAG-NEXT: s_or_b32 s9, s9, s10 +; GFX9-SDAG-NEXT: s_add_i32 s7, s7, s9 +; GFX9-SDAG-NEXT: s_cmp_lt_i32 s8, 31 +; GFX9-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00 +; GFX9-SDAG-NEXT: s_cmp_lg_u32 s6, 0 +; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00 +; GFX9-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f +; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, s7 +; GFX9-SDAG-NEXT: s_lshr_b32 s5, s5, 16 +; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0x8000 +; GFX9-SDAG-NEXT: s_or_b32 s4, s5, s4 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: fptrunc_f64_to_f16: +; GFX9-GISEL: ; %bb.0: ; %entry +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 +; GFX9-GISEL-NEXT: s_lshr_b32 s5, s3, 8 +; GFX9-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff +; GFX9-GISEL-NEXT: s_addk_i32 s4, 0xfc10 +; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffe +; GFX9-GISEL-NEXT: s_or_b32 s2, s6, s2 +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX9-GISEL-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-GISEL-NEXT: s_or_b32 s2, s5, s2 +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-GISEL-NEXT: s_sub_i32 s7, 1, s4 +; GFX9-GISEL-NEXT: s_lshl_b32 s6, s4, 12 +; GFX9-GISEL-NEXT: s_max_i32 s7, s7, 0 +; GFX9-GISEL-NEXT: s_or_b32 s6, s2, s6 +; GFX9-GISEL-NEXT: s_min_i32 s7, s7, 13 +; GFX9-GISEL-NEXT: s_bitset1_b32 s2, 12 +; GFX9-GISEL-NEXT: s_lshl_b32 s5, s5, 9 +; GFX9-GISEL-NEXT: s_lshr_b32 s8, s2, s7 +; GFX9-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00 +; GFX9-GISEL-NEXT: s_lshl_b32 s7, s8, s7 +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s7, s2 +; GFX9-GISEL-NEXT: s_cselect_b32 s2, 1, 0 +; GFX9-GISEL-NEXT: s_or_b32 s2, s8, s2 +; GFX9-GISEL-NEXT: s_cmp_lt_i32 s4, 1 +; GFX9-GISEL-NEXT: s_cselect_b32 s2, s2, s6 +; GFX9-GISEL-NEXT: s_and_b32 s6, s2, 7 +; GFX9-GISEL-NEXT: s_lshr_b32 s2, s2, 2 +; GFX9-GISEL-NEXT: s_cmp_eq_u32 s6, 3 +; GFX9-GISEL-NEXT: s_cselect_b32 s7, 1, 0 +; GFX9-GISEL-NEXT: s_cmp_gt_i32 s6, 5 +; GFX9-GISEL-NEXT: s_cselect_b32 s6, 1, 0 +; GFX9-GISEL-NEXT: s_or_b32 s6, s7, s6 +; GFX9-GISEL-NEXT: s_add_i32 s2, s2, s6 +; GFX9-GISEL-NEXT: s_cmp_gt_i32 s4, 30 
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2 +; GFX9-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f +; GFX9-GISEL-NEXT: s_cselect_b32 s2, s5, s2 +; GFX9-GISEL-NEXT: s_lshr_b32 s3, s3, 16 +; GFX9-GISEL-NEXT: s_and_b32 s3, s3, 0x8000 +; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: fptrunc_f64_to_f16: +; GFX950-SDAG: ; %bb.0: ; %entry +; GFX950-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; GFX950-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX950-SDAG-NEXT: s_mov_b32 s6, s2 +; GFX950-SDAG-NEXT: s_mov_b32 s7, s3 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_mov_b32 s4, s10 +; GFX950-SDAG-NEXT: s_mov_b32 s5, s11 +; GFX950-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX950-SDAG-NEXT: s_mov_b32 s0, s8 +; GFX950-SDAG-NEXT: s_mov_b32 s1, s9 +; GFX950-SDAG-NEXT: s_movk_i32 s4, 0x7e00 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_readfirstlane_b32 s5, v1 +; GFX950-SDAG-NEXT: s_and_b32 s6, s5, 0x1ff +; GFX950-SDAG-NEXT: v_or_b32_e32 v0, s6, v0 +; GFX950-SDAG-NEXT: s_lshr_b32 s7, s5, 8 +; GFX950-SDAG-NEXT: s_bfe_u32 s8, s5, 0xb0014 +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX950-SDAG-NEXT: s_and_b32 s6, s7, 0xffe +; GFX950-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8 +; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13 +; GFX950-SDAG-NEXT: v_readfirstlane_b32 s7, v0 +; GFX950-SDAG-NEXT: s_or_b32 s6, s6, s7 +; GFX950-SDAG-NEXT: v_readfirstlane_b32 s9, v1 +; GFX950-SDAG-NEXT: s_or_b32 s7, s6, 0x1000 +; GFX950-SDAG-NEXT: s_lshr_b32 s10, s7, s9 +; GFX950-SDAG-NEXT: s_lshl_b32 s9, s10, s9 +; GFX950-SDAG-NEXT: s_cmp_lg_u32 s9, s7 +; GFX950-SDAG-NEXT: s_cselect_b32 s7, 1, 0 +; GFX950-SDAG-NEXT: s_addk_i32 s8, 0xfc10 +; GFX950-SDAG-NEXT: s_lshl_b32 s9, s8, 12 +; GFX950-SDAG-NEXT: s_or_b32 s7, s10, s7 +; GFX950-SDAG-NEXT: s_or_b32 s9, s6, s9 +; GFX950-SDAG-NEXT: s_cmp_lt_i32 s8, 1 +; GFX950-SDAG-NEXT: s_cselect_b32 s7, s7, s9 +; GFX950-SDAG-NEXT: s_and_b32 s9, s7, 7 +; GFX950-SDAG-NEXT: s_cmp_gt_i32 s9, 5 +; GFX950-SDAG-NEXT: s_cselect_b32 s10, 1, 0 +; GFX950-SDAG-NEXT: s_cmp_eq_u32 s9, 3 +; GFX950-SDAG-NEXT: s_cselect_b32 s9, 1, 0 +; GFX950-SDAG-NEXT: s_lshr_b32 s7, s7, 2 +; GFX950-SDAG-NEXT: s_or_b32 s9, s9, s10 +; GFX950-SDAG-NEXT: s_add_i32 s7, s7, s9 +; GFX950-SDAG-NEXT: s_cmp_lt_i32 s8, 31 +; GFX950-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00 +; GFX950-SDAG-NEXT: s_cmp_lg_u32 s6, 0 +; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00 +; GFX950-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f +; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, s7 +; GFX950-SDAG-NEXT: s_lshr_b32 s5, s5, 16 +; GFX950-SDAG-NEXT: s_and_b32 s5, s5, 0x8000 +; GFX950-SDAG-NEXT: s_or_b32 s4, s5, s4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX950-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX950-SDAG-NEXT: s_endpgm +; ; GFX950-GISEL-LABEL: fptrunc_f64_to_f16: ; GFX950-GISEL: ; %bb.0: ; %entry ; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] -; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX950-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 +; GFX950-GISEL-NEXT: s_lshr_b32 s5, s3, 8 +; GFX950-GISEL-NEXT: s_and_b32 s6, 
s3, 0x1ff +; GFX950-GISEL-NEXT: s_addk_i32 s4, 0xfc10 +; GFX950-GISEL-NEXT: s_and_b32 s5, s5, 0xffe +; GFX950-GISEL-NEXT: s_or_b32 s2, s6, s2 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX950-GISEL-NEXT: s_cselect_b32 s2, 1, 0 +; GFX950-GISEL-NEXT: s_or_b32 s2, s5, s2 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0 +; GFX950-GISEL-NEXT: s_sub_i32 s7, 1, s4 +; GFX950-GISEL-NEXT: s_lshl_b32 s6, s4, 12 +; GFX950-GISEL-NEXT: s_max_i32 s7, s7, 0 +; GFX950-GISEL-NEXT: s_or_b32 s6, s2, s6 +; GFX950-GISEL-NEXT: s_min_i32 s7, s7, 13 +; GFX950-GISEL-NEXT: s_bitset1_b32 s2, 12 +; GFX950-GISEL-NEXT: s_lshl_b32 s5, s5, 9 +; GFX950-GISEL-NEXT: s_lshr_b32 s8, s2, s7 +; GFX950-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00 +; GFX950-GISEL-NEXT: s_lshl_b32 s7, s8, s7 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s7, s2 +; GFX950-GISEL-NEXT: s_cselect_b32 s2, 1, 0 +; GFX950-GISEL-NEXT: s_or_b32 s2, s8, s2 +; GFX950-GISEL-NEXT: s_cmp_lt_i32 s4, 1 +; GFX950-GISEL-NEXT: s_cselect_b32 s2, s2, s6 +; GFX950-GISEL-NEXT: s_and_b32 s6, s2, 7 +; GFX950-GISEL-NEXT: s_lshr_b32 s2, s2, 2 +; GFX950-GISEL-NEXT: s_cmp_eq_u32 s6, 3 +; GFX950-GISEL-NEXT: s_cselect_b32 s7, 1, 0 +; GFX950-GISEL-NEXT: s_cmp_gt_i32 s6, 5 +; GFX950-GISEL-NEXT: s_cselect_b32 s6, 1, 0 +; GFX950-GISEL-NEXT: s_or_b32 s6, s7, s6 +; GFX950-GISEL-NEXT: s_add_i32 s2, s2, s6 +; GFX950-GISEL-NEXT: s_cmp_gt_i32 s4, 30 +; GFX950-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2 +; GFX950-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f +; GFX950-GISEL-NEXT: s_cselect_b32 s2, s5, s2 +; GFX950-GISEL-NEXT: s_lshr_b32 s3, s3, 16 +; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0x8000 +; GFX950-GISEL-NEXT: s_or_b32 s2, s3, s2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GFX950-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000 ; GFX950-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -340,13 +852,60 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2 ; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3 -; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0 ; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 -; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s3, s2, 0x1ff +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s2, 8 +; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX11-SDAG-TRUE16-NEXT: s_bfe_u32 s3, s2, 0xb0014 +; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe +; GFX11-SDAG-TRUE16-NEXT: s_sub_i32 s4, 0x3f1, s3 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v1, s4, 0, 13 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s4, 0x1000 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s9, s5, s8 +; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s9, 
s8 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s8, s5 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-SDAG-TRUE16-NEXT: s_addk_i32 s3, 0xfc10 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s9, s5 +; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s3, 12 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s4, s8 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 1 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, s8 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s8, s5, 7 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_gt_i32 s8, 5 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s9, 1, 0 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_eq_u32 s8, 3 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s8, 1, 0 +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s5, 2 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: s_add_i32 s5, s5, s8 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 31 +; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s8, 0x7e00 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s4, s8, 0x7c00 +; GFX11-SDAG-TRUE16-NEXT: s_cmpk_eq_i32 s3, 0x40f +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s4, s5 +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s2, s2, 0x8000 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-SDAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-SDAG-TRUE16-NEXT: s_endpgm ; @@ -360,13 +919,60 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2 ; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3 -; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0 ; GFX11-SDAG-FAKE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 -; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1 ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-FAKE16-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s3, s2, 0x1ff +; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s2, 8 +; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX11-SDAG-FAKE16-NEXT: s_bfe_u32 s3, s2, 0xb0014 +; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe +; GFX11-SDAG-FAKE16-NEXT: s_sub_i32 s4, 0x3f1, s3 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v1, s4, 0, 13 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s4, 0x1000 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s9, s5, s8 +; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s9, s8 +; 
GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s8, s5 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-SDAG-FAKE16-NEXT: s_addk_i32 s3, 0xfc10 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s9, s5 +; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s3, 12 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s4, s8 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 1 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, s8 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s8, s5, 7 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_gt_i32 s8, 5 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s9, 1, 0 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_eq_u32 s8, 3 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s8, 1, 0 +; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s5, 2 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: s_add_i32 s5, s5, s8 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 31 +; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s8, 0x7e00 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s4, s8, 0x7c00 +; GFX11-SDAG-FAKE16-NEXT: s_cmpk_eq_i32 s3, 0x40f +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s4, s5 +; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s2, s2, 0x8000 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-SDAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-SDAG-FAKE16-NEXT: s_endpgm ; @@ -376,6 +982,555 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff +; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014 +; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2 +; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10 +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe +; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000 +; GFX11-GISEL-TRUE16-NEXT: s_max_i32 s6, s6, 0 +; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s7, s4, 12 +; GFX11-GISEL-TRUE16-NEXT: s_min_i32 s6, s6, 13 +; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s5, s5, 9 +; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s9, s8, s6 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s2, s7 +; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s6, s9, s6 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s5, s5, 0x7c00 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, s8 +; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s6, 1, 0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s9, s6 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_lt_i32 s4, 1 +; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s6, 
s2 +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s2, 7 +; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s2, s2, 2 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_eq_u32 s6, 3 +; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s7, 1, 0 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s6, 5 +; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s6, 1, 0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-GISEL-TRUE16-NEXT: s_add_i32 s2, s2, s6 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s4, 30 +; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 0x7c00, s2 +; GFX11-GISEL-TRUE16-NEXT: s_cmpk_eq_i32 s4, 0x40f +; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s5, s2 +; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-GISEL-TRUE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: fptrunc_f64_to_f16: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff +; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014 +; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2 +; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10 +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe +; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000 +; GFX11-GISEL-FAKE16-NEXT: s_max_i32 s6, s6, 0 +; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s7, s4, 12 +; GFX11-GISEL-FAKE16-NEXT: s_min_i32 s6, s6, 13 +; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s5, s5, 9 +; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s9, s8, s6 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s2, s7 +; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s6, s9, s6 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s5, s5, 0x7c00 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, s8 +; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s6, 1, 0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s9, s6 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_lt_i32 s4, 1 +; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s6, s2 +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s2, 7 +; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s2, s2, 2 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_eq_u32 s6, 3 +; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s7, 1, 0 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s6, 5 +; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s6, 1, 0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s7, s6 +; GFX11-GISEL-FAKE16-NEXT: s_add_i32 s2, s2, s6 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 
s4, 30 +; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 0x7c00, s2 +; GFX11-GISEL-FAKE16-NEXT: s_cmpk_eq_i32 s4, 0x40f +; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s5, s2 +; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-GISEL-FAKE16-NEXT: s_endpgm + ptr addrspace(1) %r, + ptr addrspace(1) %a) { +entry: + %a.val = load double, ptr addrspace(1) %a + %r.val = fptrunc double %a.val to half + store half %r.val, ptr addrspace(1) %r + ret void +} + +define amdgpu_kernel void @fptrunc_f64_to_f16_afn( +; SI-SDAG-LABEL: fptrunc_f64_to_f16_afn: +; SI-SDAG: ; %bb.0: ; %entry +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: s_mov_b32 s10, s2 +; SI-SDAG-NEXT: s_mov_b32 s11, s3 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s8, s6 +; SI-SDAG-NEXT: s_mov_b32 s9, s7 +; SI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-SDAG-NEXT: s_movk_i32 s0, 0x7e00 +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; SI-SDAG-NEXT: s_and_b32 s6, s1, 0x1ff +; SI-SDAG-NEXT: s_lshr_b32 s7, s1, 8 +; SI-SDAG-NEXT: s_bfe_u32 s8, s1, 0xb0014 +; SI-SDAG-NEXT: v_or_b32_e32 v0, s6, v0 +; SI-SDAG-NEXT: s_and_b32 s6, s7, 0xffe +; SI-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8 +; SI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13 +; SI-SDAG-NEXT: v_readfirstlane_b32 s7, v0 +; SI-SDAG-NEXT: v_readfirstlane_b32 s9, v1 +; SI-SDAG-NEXT: s_or_b32 s6, s6, s7 +; SI-SDAG-NEXT: s_or_b32 s7, s6, 0x1000 +; SI-SDAG-NEXT: s_lshr_b32 s10, s7, s9 +; SI-SDAG-NEXT: s_lshl_b32 s9, s10, s9 +; SI-SDAG-NEXT: s_cmp_lg_u32 s9, s7 +; SI-SDAG-NEXT: s_cselect_b32 s7, 1, 0 +; SI-SDAG-NEXT: s_addk_i32 s8, 0xfc10 +; SI-SDAG-NEXT: s_or_b32 s7, s10, s7 +; SI-SDAG-NEXT: s_lshl_b32 s9, s8, 12 +; SI-SDAG-NEXT: s_or_b32 s9, s6, s9 +; SI-SDAG-NEXT: s_cmp_lt_i32 s8, 1 +; SI-SDAG-NEXT: s_cselect_b32 s7, s7, s9 +; SI-SDAG-NEXT: s_and_b32 s9, s7, 7 +; SI-SDAG-NEXT: s_cmp_gt_i32 s9, 5 +; SI-SDAG-NEXT: s_cselect_b32 s10, 1, 0 +; SI-SDAG-NEXT: s_cmp_eq_u32 s9, 3 +; SI-SDAG-NEXT: s_cselect_b32 s9, 1, 0 +; SI-SDAG-NEXT: s_lshr_b32 s7, s7, 2 +; SI-SDAG-NEXT: s_or_b32 s9, s9, s10 +; SI-SDAG-NEXT: s_add_i32 s7, s7, s9 +; SI-SDAG-NEXT: s_cmp_lt_i32 s8, 31 +; SI-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00 +; SI-SDAG-NEXT: s_cmp_lg_u32 s6, 0 +; SI-SDAG-NEXT: s_cselect_b32 s0, s0, 0x7c00 +; SI-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f +; SI-SDAG-NEXT: s_cselect_b32 s0, s0, s7 +; SI-SDAG-NEXT: s_lshr_b32 s1, s1, 16 +; SI-SDAG-NEXT: s_and_b32 s1, s1, 0x8000 +; SI-SDAG-NEXT: s_or_b32 s6, s1, s0 +; SI-SDAG-NEXT: s_mov_b32 s0, s4 +; SI-SDAG-NEXT: s_mov_b32 s1, s5 +; SI-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; SI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: fptrunc_f64_to_f16_afn: +; SI-GISEL: ; %bb.0: ; %entry +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_waitcnt 
lgkmcnt(0) +; SI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-GISEL-NEXT: s_endpgm +; +; VI-SDAG-LABEL: fptrunc_f64_to_f16_afn: +; VI-SDAG: ; %bb.0: ; %entry +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 +; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; VI-SDAG-NEXT: s_mov_b32 s2, -1 +; VI-SDAG-NEXT: s_mov_b32 s10, s2 +; VI-SDAG-NEXT: s_mov_b32 s11, s3 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s8, s6 +; VI-SDAG-NEXT: s_mov_b32 s9, s7 +; VI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; VI-SDAG-NEXT: s_mov_b32 s0, s4 +; VI-SDAG-NEXT: s_mov_b32 s1, s5 +; VI-SDAG-NEXT: s_movk_i32 s6, 0x7e00 +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_readfirstlane_b32 s4, v1 +; VI-SDAG-NEXT: s_and_b32 s5, s4, 0x1ff +; VI-SDAG-NEXT: v_or_b32_e32 v0, s5, v0 +; VI-SDAG-NEXT: s_lshr_b32 s7, s4, 8 +; VI-SDAG-NEXT: s_bfe_u32 s8, s4, 0xb0014 +; VI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-SDAG-NEXT: s_and_b32 s5, s7, 0xffe +; VI-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13 +; VI-SDAG-NEXT: v_readfirstlane_b32 s7, v0 +; VI-SDAG-NEXT: s_or_b32 s5, s5, s7 +; VI-SDAG-NEXT: v_readfirstlane_b32 s9, v1 +; VI-SDAG-NEXT: s_or_b32 s7, s5, 0x1000 +; VI-SDAG-NEXT: s_lshr_b32 s10, s7, s9 +; VI-SDAG-NEXT: s_lshl_b32 s9, s10, s9 +; VI-SDAG-NEXT: s_cmp_lg_u32 s9, s7 +; VI-SDAG-NEXT: s_cselect_b32 s7, 1, 0 +; VI-SDAG-NEXT: s_addk_i32 s8, 0xfc10 +; VI-SDAG-NEXT: s_lshl_b32 s9, s8, 12 +; VI-SDAG-NEXT: s_or_b32 s7, s10, s7 +; VI-SDAG-NEXT: s_or_b32 s9, s5, s9 +; VI-SDAG-NEXT: s_cmp_lt_i32 s8, 1 +; VI-SDAG-NEXT: s_cselect_b32 s7, s7, s9 +; VI-SDAG-NEXT: s_and_b32 s9, s7, 7 +; VI-SDAG-NEXT: s_cmp_gt_i32 s9, 5 +; VI-SDAG-NEXT: s_cselect_b32 s10, 1, 0 +; VI-SDAG-NEXT: s_cmp_eq_u32 s9, 3 +; VI-SDAG-NEXT: s_cselect_b32 s9, 1, 0 +; VI-SDAG-NEXT: s_lshr_b32 s7, s7, 2 +; VI-SDAG-NEXT: s_or_b32 s9, s9, s10 +; VI-SDAG-NEXT: s_add_i32 s7, s7, s9 +; VI-SDAG-NEXT: s_cmp_lt_i32 s8, 31 +; VI-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00 +; VI-SDAG-NEXT: s_cmp_lg_u32 s5, 0 +; VI-SDAG-NEXT: s_cselect_b32 s5, s6, 0x7c00 +; VI-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f +; VI-SDAG-NEXT: s_cselect_b32 s5, s5, s7 +; VI-SDAG-NEXT: s_lshr_b32 s4, s4, 16 +; VI-SDAG-NEXT: s_and_b32 s4, s4, 0x8000 +; VI-SDAG-NEXT: s_or_b32 s4, s4, s5 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; VI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: fptrunc_f64_to_f16_afn: +; VI-GISEL: ; %bb.0: ; %entry +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; VI-GISEL-NEXT: s_mov_b32 s2, -1 +; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-GISEL-NEXT: s_endpgm +; +; GFX9-SDAG-LABEL: fptrunc_f64_to_f16_afn: +; GFX9-SDAG: ; %bb.0: ; %entry +; GFX9-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s6, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s7, s3 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: s_mov_b32 s4, s10 +; GFX9-SDAG-NEXT: s_mov_b32 s5, s11 +; GFX9-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; 
GFX9-SDAG-NEXT: s_mov_b32 s0, s8 +; GFX9-SDAG-NEXT: s_mov_b32 s1, s9 +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7e00 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s5, v1 +; GFX9-SDAG-NEXT: s_and_b32 s6, s5, 0x1ff +; GFX9-SDAG-NEXT: v_or_b32_e32 v0, s6, v0 +; GFX9-SDAG-NEXT: s_lshr_b32 s7, s5, 8 +; GFX9-SDAG-NEXT: s_bfe_u32 s8, s5, 0xb0014 +; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-SDAG-NEXT: s_and_b32 s6, s7, 0xffe +; GFX9-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-SDAG-NEXT: s_or_b32 s6, s6, s7 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s9, v1 +; GFX9-SDAG-NEXT: s_or_b32 s7, s6, 0x1000 +; GFX9-SDAG-NEXT: s_lshr_b32 s10, s7, s9 +; GFX9-SDAG-NEXT: s_lshl_b32 s9, s10, s9 +; GFX9-SDAG-NEXT: s_cmp_lg_u32 s9, s7 +; GFX9-SDAG-NEXT: s_cselect_b32 s7, 1, 0 +; GFX9-SDAG-NEXT: s_addk_i32 s8, 0xfc10 +; GFX9-SDAG-NEXT: s_lshl_b32 s9, s8, 12 +; GFX9-SDAG-NEXT: s_or_b32 s7, s10, s7 +; GFX9-SDAG-NEXT: s_or_b32 s9, s6, s9 +; GFX9-SDAG-NEXT: s_cmp_lt_i32 s8, 1 +; GFX9-SDAG-NEXT: s_cselect_b32 s7, s7, s9 +; GFX9-SDAG-NEXT: s_and_b32 s9, s7, 7 +; GFX9-SDAG-NEXT: s_cmp_gt_i32 s9, 5 +; GFX9-SDAG-NEXT: s_cselect_b32 s10, 1, 0 +; GFX9-SDAG-NEXT: s_cmp_eq_u32 s9, 3 +; GFX9-SDAG-NEXT: s_cselect_b32 s9, 1, 0 +; GFX9-SDAG-NEXT: s_lshr_b32 s7, s7, 2 +; GFX9-SDAG-NEXT: s_or_b32 s9, s9, s10 +; GFX9-SDAG-NEXT: s_add_i32 s7, s7, s9 +; GFX9-SDAG-NEXT: s_cmp_lt_i32 s8, 31 +; GFX9-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00 +; GFX9-SDAG-NEXT: s_cmp_lg_u32 s6, 0 +; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00 +; GFX9-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f +; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, s7 +; GFX9-SDAG-NEXT: s_lshr_b32 s5, s5, 16 +; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0x8000 +; GFX9-SDAG-NEXT: s_or_b32 s4, s5, s4 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: fptrunc_f64_to_f16_afn: +; GFX9-GISEL: ; %bb.0: ; %entry +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: fptrunc_f64_to_f16_afn: +; GFX950-SDAG: ; %bb.0: ; %entry +; GFX950-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; GFX950-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX950-SDAG-NEXT: s_mov_b32 s6, s2 +; GFX950-SDAG-NEXT: s_mov_b32 s7, s3 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_mov_b32 s4, s10 +; GFX950-SDAG-NEXT: s_mov_b32 s5, s11 +; GFX950-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GFX950-SDAG-NEXT: s_mov_b32 s0, s8 +; GFX950-SDAG-NEXT: s_mov_b32 s1, s9 +; GFX950-SDAG-NEXT: s_movk_i32 s4, 0x7e00 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_readfirstlane_b32 s5, v1 +; GFX950-SDAG-NEXT: s_and_b32 s6, s5, 0x1ff +; GFX950-SDAG-NEXT: v_or_b32_e32 v0, s6, v0 +; GFX950-SDAG-NEXT: s_lshr_b32 s7, s5, 8 +; GFX950-SDAG-NEXT: s_bfe_u32 s8, s5, 0xb0014 +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX950-SDAG-NEXT: s_and_b32 s6, s7, 0xffe +; GFX950-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8 +; 
GFX950-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13 +; GFX950-SDAG-NEXT: v_readfirstlane_b32 s7, v0 +; GFX950-SDAG-NEXT: s_or_b32 s6, s6, s7 +; GFX950-SDAG-NEXT: v_readfirstlane_b32 s9, v1 +; GFX950-SDAG-NEXT: s_or_b32 s7, s6, 0x1000 +; GFX950-SDAG-NEXT: s_lshr_b32 s10, s7, s9 +; GFX950-SDAG-NEXT: s_lshl_b32 s9, s10, s9 +; GFX950-SDAG-NEXT: s_cmp_lg_u32 s9, s7 +; GFX950-SDAG-NEXT: s_cselect_b32 s7, 1, 0 +; GFX950-SDAG-NEXT: s_addk_i32 s8, 0xfc10 +; GFX950-SDAG-NEXT: s_lshl_b32 s9, s8, 12 +; GFX950-SDAG-NEXT: s_or_b32 s7, s10, s7 +; GFX950-SDAG-NEXT: s_or_b32 s9, s6, s9 +; GFX950-SDAG-NEXT: s_cmp_lt_i32 s8, 1 +; GFX950-SDAG-NEXT: s_cselect_b32 s7, s7, s9 +; GFX950-SDAG-NEXT: s_and_b32 s9, s7, 7 +; GFX950-SDAG-NEXT: s_cmp_gt_i32 s9, 5 +; GFX950-SDAG-NEXT: s_cselect_b32 s10, 1, 0 +; GFX950-SDAG-NEXT: s_cmp_eq_u32 s9, 3 +; GFX950-SDAG-NEXT: s_cselect_b32 s9, 1, 0 +; GFX950-SDAG-NEXT: s_lshr_b32 s7, s7, 2 +; GFX950-SDAG-NEXT: s_or_b32 s9, s9, s10 +; GFX950-SDAG-NEXT: s_add_i32 s7, s7, s9 +; GFX950-SDAG-NEXT: s_cmp_lt_i32 s8, 31 +; GFX950-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00 +; GFX950-SDAG-NEXT: s_cmp_lg_u32 s6, 0 +; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00 +; GFX950-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f +; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, s7 +; GFX950-SDAG-NEXT: s_lshr_b32 s5, s5, 16 +; GFX950-SDAG-NEXT: s_and_b32 s5, s5, 0x8000 +; GFX950-SDAG-NEXT: s_or_b32 s4, s5, s4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX950-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: fptrunc_f64_to_f16_afn: +; GFX950-GISEL: ; %bb.0: ; %entry +; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX950-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GFX950-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX950-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-TRUE16-LABEL: fptrunc_f64_to_f16_afn: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s3, s2, 0x1ff +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s2, 8 +; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX11-SDAG-TRUE16-NEXT: s_bfe_u32 s3, s2, 0xb0014 +; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe +; GFX11-SDAG-TRUE16-NEXT: s_sub_i32 s4, 0x3f1, s3 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v1, s4, 0, 13 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: 
v_readfirstlane_b32 s4, v0 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s4, 0x1000 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s9, s5, s8 +; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s9, s8 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s8, s5 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-SDAG-TRUE16-NEXT: s_addk_i32 s3, 0xfc10 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s9, s5 +; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s3, 12 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s4, s8 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 1 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, s8 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s8, s5, 7 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_gt_i32 s8, 5 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s9, 1, 0 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_eq_u32 s8, 3 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s8, 1, 0 +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s5, 2 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: s_add_i32 s5, s5, s8 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 31 +; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s8, 0x7e00 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s4, s8, 0x7c00 +; GFX11-SDAG-TRUE16-NEXT: s_cmpk_eq_i32 s3, 0x40f +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s4, s5 +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s2, s2, 0x8000 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX11-SDAG-FAKE16-LABEL: fptrunc_f64_to_f16_afn: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-SDAG-FAKE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s2, v1 +; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s3, s2, 0x1ff +; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s2, 8 +; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX11-SDAG-FAKE16-NEXT: s_bfe_u32 s3, s2, 0xb0014 +; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe +; GFX11-SDAG-FAKE16-NEXT: s_sub_i32 s4, 0x3f1, s3 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v1, s4, 0, 13 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s8, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; 
GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s4, 0x1000 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s9, s5, s8 +; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s9, s8 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s8, s5 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-SDAG-FAKE16-NEXT: s_addk_i32 s3, 0xfc10 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s9, s5 +; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s3, 12 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s4, s8 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 1 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, s8 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s8, s5, 7 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_gt_i32 s8, 5 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s9, 1, 0 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_eq_u32 s8, 3 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s8, 1, 0 +; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s5, 2 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: s_add_i32 s5, s5, s8 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 31 +; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s8, 0x7e00 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s4, s8, 0x7c00 +; GFX11-SDAG-FAKE16-NEXT: s_cmpk_eq_i32 s3, 0x40f +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s4, s5 +; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s2, s2, 0x8000 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-SDAG-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-TRUE16-LABEL: fptrunc_f64_to_f16_afn: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 @@ -384,7 +1539,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX11-GISEL-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-GISEL-TRUE16-NEXT: s_endpgm ; -; GFX11-GISEL-FAKE16-LABEL: fptrunc_f64_to_f16: +; GFX11-GISEL-FAKE16-LABEL: fptrunc_f64_to_f16_afn: ; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry ; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -401,7 +1556,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ptr addrspace(1) %a) { entry: %a.val = load double, ptr addrspace(1) %a - %r.val = fptrunc double %a.val to half + %r.val = fptrunc afn double %a.val to half store half %r.val, ptr addrspace(1) %r ret void } @@ -626,25 +1781,106 @@ entry: define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; SI-SDAG-LABEL: fptrunc_v2f64_to_v2f16: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; 
SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; SI-SDAG-NEXT: s_mov_b32 s6, -1 -; SI-SDAG-NEXT: s_mov_b32 s10, s6 -; SI-SDAG-NEXT: s_mov_b32 s11, s7 +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: s_mov_b32 s10, s2 +; SI-SDAG-NEXT: s_mov_b32 s11, s3 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: s_mov_b32 s8, s2 -; SI-SDAG-NEXT: s_mov_b32 s9, s3 +; SI-SDAG-NEXT: s_mov_b32 s8, s6 +; SI-SDAG-NEXT: s_mov_b32 s9, s7 ; SI-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; SI-SDAG-NEXT: s_mov_b32 s4, s0 -; SI-SDAG-NEXT: s_mov_b32 s5, s1 +; SI-SDAG-NEXT: s_movk_i32 s0, 0x7e00 ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) -; SI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] -; SI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; SI-SDAG-NEXT: v_readfirstlane_b32 s6, v1 +; SI-SDAG-NEXT: s_and_b32 s7, s1, 0x1ff +; SI-SDAG-NEXT: s_lshr_b32 s8, s1, 8 +; SI-SDAG-NEXT: s_bfe_u32 s9, s1, 0xb0014 +; SI-SDAG-NEXT: v_or_b32_e32 v1, s7, v2 +; SI-SDAG-NEXT: s_and_b32 s7, s8, 0xffe +; SI-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9 +; SI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13 +; SI-SDAG-NEXT: v_readfirstlane_b32 s8, v1 +; SI-SDAG-NEXT: v_readfirstlane_b32 s10, v2 +; SI-SDAG-NEXT: s_or_b32 s7, s7, s8 +; SI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000 +; SI-SDAG-NEXT: s_lshr_b32 s11, s8, s10 +; SI-SDAG-NEXT: s_lshl_b32 s10, s11, s10 +; SI-SDAG-NEXT: s_cmp_lg_u32 s10, s8 +; SI-SDAG-NEXT: s_cselect_b32 s8, 1, 0 +; SI-SDAG-NEXT: s_addk_i32 s9, 0xfc10 +; SI-SDAG-NEXT: s_or_b32 s8, s11, s8 +; SI-SDAG-NEXT: s_lshl_b32 s10, s9, 12 +; SI-SDAG-NEXT: s_or_b32 s10, s7, s10 +; SI-SDAG-NEXT: s_cmp_lt_i32 s9, 1 +; SI-SDAG-NEXT: s_cselect_b32 s8, s8, s10 +; SI-SDAG-NEXT: s_and_b32 s10, s8, 7 +; SI-SDAG-NEXT: s_cmp_gt_i32 s10, 5 +; SI-SDAG-NEXT: s_cselect_b32 s11, 1, 0 +; SI-SDAG-NEXT: s_cmp_eq_u32 s10, 3 +; SI-SDAG-NEXT: s_cselect_b32 s10, 1, 0 +; SI-SDAG-NEXT: s_lshr_b32 s8, s8, 2 +; SI-SDAG-NEXT: s_or_b32 s10, s10, s11 +; SI-SDAG-NEXT: s_add_i32 s8, s8, s10 +; SI-SDAG-NEXT: s_cmp_lt_i32 s9, 31 +; SI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00 +; SI-SDAG-NEXT: s_cmp_lg_u32 s7, 0 +; SI-SDAG-NEXT: s_cselect_b32 s7, s0, 0x7c00 +; SI-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f +; SI-SDAG-NEXT: s_cselect_b32 s7, s7, s8 +; SI-SDAG-NEXT: s_lshr_b32 s1, s1, 16 +; SI-SDAG-NEXT: s_and_b32 s8, s6, 0x1ff +; SI-SDAG-NEXT: s_lshr_b32 s9, s6, 8 +; SI-SDAG-NEXT: s_bfe_u32 s10, s6, 0xb0014 +; SI-SDAG-NEXT: s_and_b32 s1, s1, 0x8000 +; SI-SDAG-NEXT: v_or_b32_e32 v0, s8, v0 +; SI-SDAG-NEXT: s_and_b32 s8, s9, 0xffe +; SI-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10 +; SI-SDAG-NEXT: s_or_b32 s1, s1, s7 +; SI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13 +; SI-SDAG-NEXT: s_lshl_b32 s1, s1, 16 +; SI-SDAG-NEXT: v_readfirstlane_b32 s7, v0 +; SI-SDAG-NEXT: v_readfirstlane_b32 s9, v1 +; SI-SDAG-NEXT: s_or_b32 s7, s8, s7 +; SI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000 +; SI-SDAG-NEXT: s_lshr_b32 s11, s8, s9 +; SI-SDAG-NEXT: s_lshl_b32 s9, s11, s9 +; SI-SDAG-NEXT: s_cmp_lg_u32 s9, s8 +; SI-SDAG-NEXT: s_cselect_b32 s8, 1, 0 +; SI-SDAG-NEXT: s_addk_i32 s10, 0xfc10 +; SI-SDAG-NEXT: s_or_b32 s8, s11, 
s8 +; SI-SDAG-NEXT: s_lshl_b32 s9, s10, 12 +; SI-SDAG-NEXT: s_or_b32 s9, s7, s9 +; SI-SDAG-NEXT: s_cmp_lt_i32 s10, 1 +; SI-SDAG-NEXT: s_cselect_b32 s8, s8, s9 +; SI-SDAG-NEXT: s_and_b32 s9, s8, 7 +; SI-SDAG-NEXT: s_cmp_gt_i32 s9, 5 +; SI-SDAG-NEXT: s_cselect_b32 s11, 1, 0 +; SI-SDAG-NEXT: s_cmp_eq_u32 s9, 3 +; SI-SDAG-NEXT: s_cselect_b32 s9, 1, 0 +; SI-SDAG-NEXT: s_lshr_b32 s8, s8, 2 +; SI-SDAG-NEXT: s_or_b32 s9, s9, s11 +; SI-SDAG-NEXT: s_add_i32 s8, s8, s9 +; SI-SDAG-NEXT: s_cmp_lt_i32 s10, 31 +; SI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00 +; SI-SDAG-NEXT: s_cmp_lg_u32 s7, 0 +; SI-SDAG-NEXT: s_cselect_b32 s0, s0, 0x7c00 +; SI-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f +; SI-SDAG-NEXT: s_cselect_b32 s0, s0, s8 +; SI-SDAG-NEXT: s_lshr_b32 s6, s6, 16 +; SI-SDAG-NEXT: s_and_b32 s6, s6, 0x8000 +; SI-SDAG-NEXT: s_or_b32 s0, s6, s0 +; SI-SDAG-NEXT: s_and_b32 s0, s0, 0xffff +; SI-SDAG-NEXT: s_or_b32 s6, s0, s1 +; SI-SDAG-NEXT: s_mov_b32 s0, s4 +; SI-SDAG-NEXT: s_mov_b32 s1, s5 +; SI-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: fptrunc_v2f64_to_v2f16: @@ -654,6 +1890,1251 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; SI-GISEL-NEXT: s_mov_b32 s2, -1 ; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_bfe_u32 s3, s5, 0xb0014 +; SI-GISEL-NEXT: s_lshr_b32 s8, s5, 8 +; SI-GISEL-NEXT: s_and_b32 s9, s5, 0x1ff +; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10 +; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe +; SI-GISEL-NEXT: s_or_b32 s4, s9, s4 +; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 +; SI-GISEL-NEXT: s_or_b32 s4, s8, s4 +; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0 +; SI-GISEL-NEXT: s_lshl_b32 s8, s8, 9 +; SI-GISEL-NEXT: s_lshl_b32 s9, s3, 12 +; SI-GISEL-NEXT: s_sub_i32 s10, 1, s3 +; SI-GISEL-NEXT: s_or_b32 s11, s4, 0x1000 +; SI-GISEL-NEXT: s_or_b32 s8, s8, 0x7c00 +; SI-GISEL-NEXT: s_or_b32 s4, s4, s9 +; SI-GISEL-NEXT: s_max_i32 s9, s10, 0 +; SI-GISEL-NEXT: s_min_i32 s9, s9, 13 +; SI-GISEL-NEXT: s_lshr_b32 s10, s11, s9 +; SI-GISEL-NEXT: s_lshl_b32 s9, s10, s9 +; SI-GISEL-NEXT: s_cmp_lg_u32 s9, s11 +; SI-GISEL-NEXT: s_cselect_b32 s9, 1, 0 +; SI-GISEL-NEXT: s_or_b32 s9, s10, s9 +; SI-GISEL-NEXT: s_cmp_lt_i32 s3, 1 +; SI-GISEL-NEXT: s_cselect_b32 s4, s9, s4 +; SI-GISEL-NEXT: s_and_b32 s9, s4, 7 +; SI-GISEL-NEXT: s_lshr_b32 s4, s4, 2 +; SI-GISEL-NEXT: s_cmp_eq_u32 s9, 3 +; SI-GISEL-NEXT: s_cselect_b32 s10, 1, 0 +; SI-GISEL-NEXT: s_cmp_gt_i32 s9, 5 +; SI-GISEL-NEXT: s_cselect_b32 s9, 1, 0 +; SI-GISEL-NEXT: s_or_b32 s9, s10, s9 +; SI-GISEL-NEXT: s_add_i32 s4, s4, s9 +; SI-GISEL-NEXT: s_cmp_gt_i32 s3, 30 +; SI-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4 +; SI-GISEL-NEXT: s_cmpk_eq_i32 s3, 0x40f +; SI-GISEL-NEXT: s_cselect_b32 s3, s8, s4 +; SI-GISEL-NEXT: s_lshr_b32 s4, s5, 16 +; SI-GISEL-NEXT: s_bfe_u32 s5, s7, 0xb0014 +; SI-GISEL-NEXT: s_lshr_b32 s8, s7, 8 +; SI-GISEL-NEXT: s_and_b32 s9, s7, 0x1ff +; SI-GISEL-NEXT: s_and_b32 s4, s4, 0x8000 +; SI-GISEL-NEXT: s_addk_i32 s5, 0xfc10 +; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe +; SI-GISEL-NEXT: s_or_b32 s6, s9, s6 +; SI-GISEL-NEXT: s_or_b32 s3, s4, s3 +; SI-GISEL-NEXT: s_cmp_lg_u32 s6, 0 +; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 +; SI-GISEL-NEXT: s_or_b32 s4, s8, s4 +; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0 +; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9 +; SI-GISEL-NEXT: s_lshl_b32 s8, s5, 12 +; SI-GISEL-NEXT: s_sub_i32 s9, 1, s5 +; 
SI-GISEL-NEXT: s_or_b32 s10, s4, 0x1000 +; SI-GISEL-NEXT: s_or_b32 s6, s6, 0x7c00 +; SI-GISEL-NEXT: s_or_b32 s4, s4, s8 +; SI-GISEL-NEXT: s_max_i32 s8, s9, 0 +; SI-GISEL-NEXT: s_min_i32 s8, s8, 13 +; SI-GISEL-NEXT: s_lshr_b32 s9, s10, s8 +; SI-GISEL-NEXT: s_lshl_b32 s8, s9, s8 +; SI-GISEL-NEXT: s_cmp_lg_u32 s8, s10 +; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0 +; SI-GISEL-NEXT: s_or_b32 s8, s9, s8 +; SI-GISEL-NEXT: s_cmp_lt_i32 s5, 1 +; SI-GISEL-NEXT: s_cselect_b32 s4, s8, s4 +; SI-GISEL-NEXT: s_and_b32 s8, s4, 7 +; SI-GISEL-NEXT: s_lshr_b32 s4, s4, 2 +; SI-GISEL-NEXT: s_cmp_eq_u32 s8, 3 +; SI-GISEL-NEXT: s_cselect_b32 s9, 1, 0 +; SI-GISEL-NEXT: s_cmp_gt_i32 s8, 5 +; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0 +; SI-GISEL-NEXT: s_or_b32 s8, s9, s8 +; SI-GISEL-NEXT: s_add_i32 s4, s4, s8 +; SI-GISEL-NEXT: s_cmp_gt_i32 s5, 30 +; SI-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4 +; SI-GISEL-NEXT: s_cmpk_eq_i32 s5, 0x40f +; SI-GISEL-NEXT: s_cselect_b32 s4, s6, s4 +; SI-GISEL-NEXT: s_lshr_b32 s5, s7, 16 +; SI-GISEL-NEXT: s_and_b32 s3, s3, 0xffff +; SI-GISEL-NEXT: s_and_b32 s5, s5, 0x8000 +; SI-GISEL-NEXT: s_or_b32 s4, s5, s4 +; SI-GISEL-NEXT: s_and_b32 s4, s4, 0xffff +; SI-GISEL-NEXT: s_lshl_b32 s4, s4, 16 +; SI-GISEL-NEXT: s_or_b32 s4, s3, s4 +; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-GISEL-NEXT: s_endpgm +; +; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f16: +; VI-SDAG: ; %bb.0: ; %entry +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 +; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; VI-SDAG-NEXT: s_mov_b32 s2, -1 +; VI-SDAG-NEXT: s_mov_b32 s10, s2 +; VI-SDAG-NEXT: s_mov_b32 s11, s3 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s8, s6 +; VI-SDAG-NEXT: s_mov_b32 s9, s7 +; VI-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; VI-SDAG-NEXT: s_mov_b32 s0, s4 +; VI-SDAG-NEXT: s_mov_b32 s1, s5 +; VI-SDAG-NEXT: s_movk_i32 s6, 0x7e00 +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) +; VI-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; VI-SDAG-NEXT: s_and_b32 s7, s4, 0x1ff +; VI-SDAG-NEXT: v_readfirstlane_b32 s5, v1 +; VI-SDAG-NEXT: v_or_b32_e32 v1, s7, v2 +; VI-SDAG-NEXT: s_lshr_b32 s8, s4, 8 +; VI-SDAG-NEXT: s_bfe_u32 s9, s4, 0xb0014 +; VI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-SDAG-NEXT: s_and_b32 s7, s8, 0xffe +; VI-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13 +; VI-SDAG-NEXT: v_readfirstlane_b32 s8, v1 +; VI-SDAG-NEXT: s_or_b32 s7, s7, s8 +; VI-SDAG-NEXT: v_readfirstlane_b32 s10, v2 +; VI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000 +; VI-SDAG-NEXT: s_lshr_b32 s11, s8, s10 +; VI-SDAG-NEXT: s_lshl_b32 s10, s11, s10 +; VI-SDAG-NEXT: s_cmp_lg_u32 s10, s8 +; VI-SDAG-NEXT: s_cselect_b32 s8, 1, 0 +; VI-SDAG-NEXT: s_addk_i32 s9, 0xfc10 +; VI-SDAG-NEXT: s_lshl_b32 s10, s9, 12 +; VI-SDAG-NEXT: s_or_b32 s8, s11, s8 +; VI-SDAG-NEXT: s_or_b32 s10, s7, s10 +; VI-SDAG-NEXT: s_cmp_lt_i32 s9, 1 +; VI-SDAG-NEXT: s_cselect_b32 s8, s8, s10 +; VI-SDAG-NEXT: s_and_b32 s10, s8, 7 +; VI-SDAG-NEXT: s_cmp_gt_i32 s10, 5 +; VI-SDAG-NEXT: s_cselect_b32 s11, 1, 0 +; VI-SDAG-NEXT: s_cmp_eq_u32 s10, 3 +; VI-SDAG-NEXT: s_cselect_b32 s10, 1, 0 +; VI-SDAG-NEXT: s_lshr_b32 s8, s8, 2 +; VI-SDAG-NEXT: s_or_b32 s10, s10, s11 +; VI-SDAG-NEXT: s_add_i32 s8, s8, s10 +; VI-SDAG-NEXT: s_cmp_lt_i32 s9, 31 +; VI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00 +; VI-SDAG-NEXT: s_cmp_lg_u32 s7, 0 +; VI-SDAG-NEXT: s_cselect_b32 s7, s6, 0x7c00 +; VI-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f +; VI-SDAG-NEXT: 
s_cselect_b32 s7, s7, s8 +; VI-SDAG-NEXT: s_and_b32 s8, s5, 0x1ff +; VI-SDAG-NEXT: v_or_b32_e32 v0, s8, v0 +; VI-SDAG-NEXT: s_lshr_b32 s4, s4, 16 +; VI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-SDAG-NEXT: s_lshr_b32 s9, s5, 8 +; VI-SDAG-NEXT: s_bfe_u32 s10, s5, 0xb0014 +; VI-SDAG-NEXT: s_and_b32 s4, s4, 0x8000 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-SDAG-NEXT: s_and_b32 s8, s9, 0xffe +; VI-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10 +; VI-SDAG-NEXT: s_or_b32 s4, s4, s7 +; VI-SDAG-NEXT: v_readfirstlane_b32 s7, v0 +; VI-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13 +; VI-SDAG-NEXT: s_or_b32 s7, s8, s7 +; VI-SDAG-NEXT: v_readfirstlane_b32 s9, v1 +; VI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000 +; VI-SDAG-NEXT: s_lshr_b32 s11, s8, s9 +; VI-SDAG-NEXT: s_lshl_b32 s4, s4, 16 +; VI-SDAG-NEXT: s_lshl_b32 s9, s11, s9 +; VI-SDAG-NEXT: s_cmp_lg_u32 s9, s8 +; VI-SDAG-NEXT: s_cselect_b32 s8, 1, 0 +; VI-SDAG-NEXT: s_addk_i32 s10, 0xfc10 +; VI-SDAG-NEXT: s_lshl_b32 s9, s10, 12 +; VI-SDAG-NEXT: s_or_b32 s8, s11, s8 +; VI-SDAG-NEXT: s_or_b32 s9, s7, s9 +; VI-SDAG-NEXT: s_cmp_lt_i32 s10, 1 +; VI-SDAG-NEXT: s_cselect_b32 s8, s8, s9 +; VI-SDAG-NEXT: s_and_b32 s9, s8, 7 +; VI-SDAG-NEXT: s_cmp_gt_i32 s9, 5 +; VI-SDAG-NEXT: s_cselect_b32 s11, 1, 0 +; VI-SDAG-NEXT: s_cmp_eq_u32 s9, 3 +; VI-SDAG-NEXT: s_cselect_b32 s9, 1, 0 +; VI-SDAG-NEXT: s_lshr_b32 s8, s8, 2 +; VI-SDAG-NEXT: s_or_b32 s9, s9, s11 +; VI-SDAG-NEXT: s_add_i32 s8, s8, s9 +; VI-SDAG-NEXT: s_cmp_lt_i32 s10, 31 +; VI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00 +; VI-SDAG-NEXT: s_cmp_lg_u32 s7, 0 +; VI-SDAG-NEXT: s_cselect_b32 s6, s6, 0x7c00 +; VI-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f +; VI-SDAG-NEXT: s_cselect_b32 s6, s6, s8 +; VI-SDAG-NEXT: s_lshr_b32 s5, s5, 16 +; VI-SDAG-NEXT: s_and_b32 s5, s5, 0x8000 +; VI-SDAG-NEXT: s_or_b32 s5, s5, s6 +; VI-SDAG-NEXT: s_and_b32 s5, s5, 0xffff +; VI-SDAG-NEXT: s_or_b32 s4, s5, s4 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; VI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f16: +; VI-GISEL: ; %bb.0: ; %entry +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: s_bfe_u32 s2, s5, 0xb0014 +; VI-GISEL-NEXT: s_lshr_b32 s3, s5, 8 +; VI-GISEL-NEXT: s_and_b32 s8, s5, 0x1ff +; VI-GISEL-NEXT: s_addk_i32 s2, 0xfc10 +; VI-GISEL-NEXT: s_and_b32 s3, s3, 0xffe +; VI-GISEL-NEXT: s_or_b32 s4, s8, s4 +; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 +; VI-GISEL-NEXT: s_or_b32 s3, s3, s4 +; VI-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 +; VI-GISEL-NEXT: s_sub_i32 s9, 1, s2 +; VI-GISEL-NEXT: s_lshl_b32 s8, s2, 12 +; VI-GISEL-NEXT: s_max_i32 s9, s9, 0 +; VI-GISEL-NEXT: s_or_b32 s8, s3, s8 +; VI-GISEL-NEXT: s_min_i32 s9, s9, 13 +; VI-GISEL-NEXT: s_bitset1_b32 s3, 12 +; VI-GISEL-NEXT: s_lshl_b32 s4, s4, 9 +; VI-GISEL-NEXT: s_lshr_b32 s10, s3, s9 +; VI-GISEL-NEXT: s_or_b32 s4, s4, 0x7c00 +; VI-GISEL-NEXT: s_lshl_b32 s9, s10, s9 +; VI-GISEL-NEXT: s_cmp_lg_u32 s9, s3 +; VI-GISEL-NEXT: s_cselect_b32 s3, 1, 0 +; VI-GISEL-NEXT: s_or_b32 s3, s10, s3 +; VI-GISEL-NEXT: s_cmp_lt_i32 s2, 1 +; VI-GISEL-NEXT: s_cselect_b32 s3, s3, s8 +; VI-GISEL-NEXT: s_and_b32 s8, s3, 7 +; VI-GISEL-NEXT: s_lshr_b32 s3, s3, 2 +; VI-GISEL-NEXT: s_cmp_eq_u32 s8, 3 +; VI-GISEL-NEXT: s_cselect_b32 s9, 1, 0 +; VI-GISEL-NEXT: s_cmp_gt_i32 s8, 5 +; VI-GISEL-NEXT: s_cselect_b32 s8, 1, 0 +; VI-GISEL-NEXT: s_or_b32 s8, s9, 
s8 +; VI-GISEL-NEXT: s_add_i32 s3, s3, s8 +; VI-GISEL-NEXT: s_cmp_gt_i32 s2, 30 +; VI-GISEL-NEXT: s_cselect_b32 s3, 0x7c00, s3 +; VI-GISEL-NEXT: s_cmpk_eq_i32 s2, 0x40f +; VI-GISEL-NEXT: s_cselect_b32 s2, s4, s3 +; VI-GISEL-NEXT: s_lshr_b32 s3, s5, 16 +; VI-GISEL-NEXT: s_and_b32 s3, s3, 0x8000 +; VI-GISEL-NEXT: s_or_b32 s2, s3, s2 +; VI-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014 +; VI-GISEL-NEXT: s_lshr_b32 s4, s7, 8 +; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff +; VI-GISEL-NEXT: s_addk_i32 s3, 0xfc10 +; VI-GISEL-NEXT: s_and_b32 s4, s4, 0xffe +; VI-GISEL-NEXT: s_or_b32 s5, s5, s6 +; VI-GISEL-NEXT: s_cmp_lg_u32 s5, 0 +; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0 +; VI-GISEL-NEXT: s_or_b32 s4, s4, s5 +; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0 +; VI-GISEL-NEXT: s_sub_i32 s8, 1, s3 +; VI-GISEL-NEXT: s_lshl_b32 s6, s3, 12 +; VI-GISEL-NEXT: s_max_i32 s8, s8, 0 +; VI-GISEL-NEXT: s_or_b32 s6, s4, s6 +; VI-GISEL-NEXT: s_min_i32 s8, s8, 13 +; VI-GISEL-NEXT: s_bitset1_b32 s4, 12 +; VI-GISEL-NEXT: s_lshl_b32 s5, s5, 9 +; VI-GISEL-NEXT: s_lshr_b32 s9, s4, s8 +; VI-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00 +; VI-GISEL-NEXT: s_lshl_b32 s8, s9, s8 +; VI-GISEL-NEXT: s_cmp_lg_u32 s8, s4 +; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0 +; VI-GISEL-NEXT: s_or_b32 s4, s9, s4 +; VI-GISEL-NEXT: s_cmp_lt_i32 s3, 1 +; VI-GISEL-NEXT: s_cselect_b32 s4, s4, s6 +; VI-GISEL-NEXT: s_and_b32 s6, s4, 7 +; VI-GISEL-NEXT: s_lshr_b32 s4, s4, 2 +; VI-GISEL-NEXT: s_cmp_eq_u32 s6, 3 +; VI-GISEL-NEXT: s_cselect_b32 s8, 1, 0 +; VI-GISEL-NEXT: s_cmp_gt_i32 s6, 5 +; VI-GISEL-NEXT: s_cselect_b32 s6, 1, 0 +; VI-GISEL-NEXT: s_or_b32 s6, s8, s6 +; VI-GISEL-NEXT: s_add_i32 s4, s4, s6 +; VI-GISEL-NEXT: s_cmp_gt_i32 s3, 30 +; VI-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4 +; VI-GISEL-NEXT: s_cmpk_eq_i32 s3, 0x40f +; VI-GISEL-NEXT: s_cselect_b32 s3, s5, s4 +; VI-GISEL-NEXT: s_lshr_b32 s4, s7, 16 +; VI-GISEL-NEXT: s_and_b32 s4, s4, 0x8000 +; VI-GISEL-NEXT: s_or_b32 s3, s4, s3 +; VI-GISEL-NEXT: s_and_b32 s3, s3, 0xffff +; VI-GISEL-NEXT: s_and_b32 s2, s2, 0xffff +; VI-GISEL-NEXT: s_lshl_b32 s3, s3, 16 +; VI-GISEL-NEXT: s_or_b32 s2, s2, s3 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: s_mov_b32 s2, -1 +; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-GISEL-NEXT: s_endpgm +; +; GFX9-SDAG-LABEL: fptrunc_v2f64_to_v2f16: +; GFX9-SDAG: ; %bb.0: ; %entry +; GFX9-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s6, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s7, s3 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: s_mov_b32 s4, s10 +; GFX9-SDAG-NEXT: s_mov_b32 s5, s11 +; GFX9-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-SDAG-NEXT: s_mov_b32 s0, s8 +; GFX9-SDAG-NEXT: s_mov_b32 s1, s9 +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7e00 +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; GFX9-SDAG-NEXT: s_and_b32 s7, s5, 0x1ff +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s6, v1 +; GFX9-SDAG-NEXT: v_or_b32_e32 v1, s7, v2 +; GFX9-SDAG-NEXT: s_lshr_b32 s8, s5, 8 +; GFX9-SDAG-NEXT: s_bfe_u32 s9, s5, 0xb0014 +; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-SDAG-NEXT: s_and_b32 s7, s8, 0xffe +; GFX9-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s8, v1 +; GFX9-SDAG-NEXT: s_or_b32 s7, s7, s8 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s10, 
v2 +; GFX9-SDAG-NEXT: s_or_b32 s8, s7, 0x1000 +; GFX9-SDAG-NEXT: s_lshr_b32 s11, s8, s10 +; GFX9-SDAG-NEXT: s_lshl_b32 s10, s11, s10 +; GFX9-SDAG-NEXT: s_cmp_lg_u32 s10, s8 +; GFX9-SDAG-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-SDAG-NEXT: s_addk_i32 s9, 0xfc10 +; GFX9-SDAG-NEXT: s_lshl_b32 s10, s9, 12 +; GFX9-SDAG-NEXT: s_or_b32 s8, s11, s8 +; GFX9-SDAG-NEXT: s_or_b32 s10, s7, s10 +; GFX9-SDAG-NEXT: s_cmp_lt_i32 s9, 1 +; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, s10 +; GFX9-SDAG-NEXT: s_and_b32 s10, s8, 7 +; GFX9-SDAG-NEXT: s_cmp_gt_i32 s10, 5 +; GFX9-SDAG-NEXT: s_cselect_b32 s11, 1, 0 +; GFX9-SDAG-NEXT: s_cmp_eq_u32 s10, 3 +; GFX9-SDAG-NEXT: s_cselect_b32 s10, 1, 0 +; GFX9-SDAG-NEXT: s_lshr_b32 s8, s8, 2 +; GFX9-SDAG-NEXT: s_or_b32 s10, s10, s11 +; GFX9-SDAG-NEXT: s_add_i32 s8, s8, s10 +; GFX9-SDAG-NEXT: s_cmp_lt_i32 s9, 31 +; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00 +; GFX9-SDAG-NEXT: s_cmp_lg_u32 s7, 0 +; GFX9-SDAG-NEXT: s_cselect_b32 s7, s4, 0x7c00 +; GFX9-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f +; GFX9-SDAG-NEXT: s_cselect_b32 s7, s7, s8 +; GFX9-SDAG-NEXT: s_and_b32 s8, s6, 0x1ff +; GFX9-SDAG-NEXT: v_or_b32_e32 v0, s8, v0 +; GFX9-SDAG-NEXT: s_lshr_b32 s5, s5, 16 +; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-SDAG-NEXT: s_lshr_b32 s9, s6, 8 +; GFX9-SDAG-NEXT: s_bfe_u32 s10, s6, 0xb0014 +; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0x8000 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-SDAG-NEXT: s_and_b32 s8, s9, 0xffe +; GFX9-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10 +; GFX9-SDAG-NEXT: s_or_b32 s5, s5, s7 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13 +; GFX9-SDAG-NEXT: s_or_b32 s7, s8, s7 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s9, v1 +; GFX9-SDAG-NEXT: s_or_b32 s8, s7, 0x1000 +; GFX9-SDAG-NEXT: s_lshr_b32 s11, s8, s9 +; GFX9-SDAG-NEXT: s_lshl_b32 s9, s11, s9 +; GFX9-SDAG-NEXT: s_cmp_lg_u32 s9, s8 +; GFX9-SDAG-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-SDAG-NEXT: s_addk_i32 s10, 0xfc10 +; GFX9-SDAG-NEXT: s_lshl_b32 s9, s10, 12 +; GFX9-SDAG-NEXT: s_or_b32 s8, s11, s8 +; GFX9-SDAG-NEXT: s_or_b32 s9, s7, s9 +; GFX9-SDAG-NEXT: s_cmp_lt_i32 s10, 1 +; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, s9 +; GFX9-SDAG-NEXT: s_and_b32 s9, s8, 7 +; GFX9-SDAG-NEXT: s_cmp_gt_i32 s9, 5 +; GFX9-SDAG-NEXT: s_cselect_b32 s11, 1, 0 +; GFX9-SDAG-NEXT: s_cmp_eq_u32 s9, 3 +; GFX9-SDAG-NEXT: s_cselect_b32 s9, 1, 0 +; GFX9-SDAG-NEXT: s_lshr_b32 s8, s8, 2 +; GFX9-SDAG-NEXT: s_or_b32 s9, s9, s11 +; GFX9-SDAG-NEXT: s_add_i32 s8, s8, s9 +; GFX9-SDAG-NEXT: s_cmp_lt_i32 s10, 31 +; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00 +; GFX9-SDAG-NEXT: s_cmp_lg_u32 s7, 0 +; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00 +; GFX9-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f +; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, s8 +; GFX9-SDAG-NEXT: s_lshr_b32 s6, s6, 16 +; GFX9-SDAG-NEXT: s_and_b32 s6, s6, 0x8000 +; GFX9-SDAG-NEXT: s_or_b32 s4, s6, s4 +; GFX9-SDAG-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: fptrunc_v2f64_to_v2f16: +; GFX9-GISEL: ; %bb.0: ; %entry +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_bfe_u32 s2, s5, 0xb0014 +; GFX9-GISEL-NEXT: s_lshr_b32 s3, s5, 8 +; GFX9-GISEL-NEXT: s_and_b32 s8, s5, 0x1ff +; GFX9-GISEL-NEXT: s_addk_i32 s2, 0xfc10 +; GFX9-GISEL-NEXT: s_and_b32 s3, s3, 0xffe +; GFX9-GISEL-NEXT: 
s_or_b32 s4, s8, s4 +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0 +; GFX9-GISEL-NEXT: s_or_b32 s3, s3, s4 +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0 +; GFX9-GISEL-NEXT: s_sub_i32 s9, 1, s2 +; GFX9-GISEL-NEXT: s_lshl_b32 s8, s2, 12 +; GFX9-GISEL-NEXT: s_max_i32 s9, s9, 0 +; GFX9-GISEL-NEXT: s_or_b32 s8, s3, s8 +; GFX9-GISEL-NEXT: s_min_i32 s9, s9, 13 +; GFX9-GISEL-NEXT: s_bitset1_b32 s3, 12 +; GFX9-GISEL-NEXT: s_lshl_b32 s4, s4, 9 +; GFX9-GISEL-NEXT: s_lshr_b32 s10, s3, s9 +; GFX9-GISEL-NEXT: s_or_b32 s4, s4, 0x7c00 +; GFX9-GISEL-NEXT: s_lshl_b32 s9, s10, s9 +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s9, s3 +; GFX9-GISEL-NEXT: s_cselect_b32 s3, 1, 0 +; GFX9-GISEL-NEXT: s_or_b32 s3, s10, s3 +; GFX9-GISEL-NEXT: s_cmp_lt_i32 s2, 1 +; GFX9-GISEL-NEXT: s_cselect_b32 s3, s3, s8 +; GFX9-GISEL-NEXT: s_and_b32 s8, s3, 7 +; GFX9-GISEL-NEXT: s_lshr_b32 s3, s3, 2 +; GFX9-GISEL-NEXT: s_cmp_eq_u32 s8, 3 +; GFX9-GISEL-NEXT: s_cselect_b32 s9, 1, 0 +; GFX9-GISEL-NEXT: s_cmp_gt_i32 s8, 5 +; GFX9-GISEL-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-GISEL-NEXT: s_or_b32 s8, s9, s8 +; GFX9-GISEL-NEXT: s_add_i32 s3, s3, s8 +; GFX9-GISEL-NEXT: s_cmp_gt_i32 s2, 30 +; GFX9-GISEL-NEXT: s_cselect_b32 s3, 0x7c00, s3 +; GFX9-GISEL-NEXT: s_cmpk_eq_i32 s2, 0x40f +; GFX9-GISEL-NEXT: s_cselect_b32 s2, s4, s3 +; GFX9-GISEL-NEXT: s_lshr_b32 s3, s5, 16 +; GFX9-GISEL-NEXT: s_and_b32 s3, s3, 0x8000 +; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2 +; GFX9-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014 +; GFX9-GISEL-NEXT: s_lshr_b32 s4, s7, 8 +; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff +; GFX9-GISEL-NEXT: s_addk_i32 s3, 0xfc10 +; GFX9-GISEL-NEXT: s_and_b32 s4, s4, 0xffe +; GFX9-GISEL-NEXT: s_or_b32 s5, s5, s6 +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s5, 0 +; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-GISEL-NEXT: s_or_b32 s4, s4, s5 +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-GISEL-NEXT: s_sub_i32 s8, 1, s3 +; GFX9-GISEL-NEXT: s_lshl_b32 s6, s3, 12 +; GFX9-GISEL-NEXT: s_max_i32 s8, s8, 0 +; GFX9-GISEL-NEXT: s_or_b32 s6, s4, s6 +; GFX9-GISEL-NEXT: s_min_i32 s8, s8, 13 +; GFX9-GISEL-NEXT: s_bitset1_b32 s4, 12 +; GFX9-GISEL-NEXT: s_lshl_b32 s5, s5, 9 +; GFX9-GISEL-NEXT: s_lshr_b32 s9, s4, s8 +; GFX9-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00 +; GFX9-GISEL-NEXT: s_lshl_b32 s8, s9, s8 +; GFX9-GISEL-NEXT: s_cmp_lg_u32 s8, s4 +; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0 +; GFX9-GISEL-NEXT: s_or_b32 s4, s9, s4 +; GFX9-GISEL-NEXT: s_cmp_lt_i32 s3, 1 +; GFX9-GISEL-NEXT: s_cselect_b32 s4, s4, s6 +; GFX9-GISEL-NEXT: s_and_b32 s6, s4, 7 +; GFX9-GISEL-NEXT: s_lshr_b32 s4, s4, 2 +; GFX9-GISEL-NEXT: s_cmp_eq_u32 s6, 3 +; GFX9-GISEL-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-GISEL-NEXT: s_cmp_gt_i32 s6, 5 +; GFX9-GISEL-NEXT: s_cselect_b32 s6, 1, 0 +; GFX9-GISEL-NEXT: s_or_b32 s6, s8, s6 +; GFX9-GISEL-NEXT: s_add_i32 s4, s4, s6 +; GFX9-GISEL-NEXT: s_cmp_gt_i32 s3, 30 +; GFX9-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4 +; GFX9-GISEL-NEXT: s_cmpk_eq_i32 s3, 0x40f +; GFX9-GISEL-NEXT: s_cselect_b32 s3, s5, s4 +; GFX9-GISEL-NEXT: s_lshr_b32 s4, s7, 16 +; GFX9-GISEL-NEXT: s_and_b32 s4, s4, 0x8000 +; GFX9-GISEL-NEXT: s_or_b32 s3, s4, s3 +; GFX9-GISEL-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: fptrunc_v2f64_to_v2f16: +; GFX950-SDAG: ; %bb.0: ; %entry +; GFX950-SDAG-NEXT: 
s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; GFX950-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX950-SDAG-NEXT: s_mov_b32 s6, s2 +; GFX950-SDAG-NEXT: s_mov_b32 s7, s3 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_mov_b32 s4, s10 +; GFX950-SDAG-NEXT: s_mov_b32 s5, s11 +; GFX950-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX950-SDAG-NEXT: s_mov_b32 s0, s8 +; GFX950-SDAG-NEXT: s_mov_b32 s1, s9 +; GFX950-SDAG-NEXT: s_movk_i32 s4, 0x7e00 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; GFX950-SDAG-NEXT: s_and_b32 s7, s5, 0x1ff +; GFX950-SDAG-NEXT: v_readfirstlane_b32 s6, v1 +; GFX950-SDAG-NEXT: v_or_b32_e32 v1, s7, v2 +; GFX950-SDAG-NEXT: s_lshr_b32 s8, s5, 8 +; GFX950-SDAG-NEXT: s_bfe_u32 s9, s5, 0xb0014 +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX950-SDAG-NEXT: s_and_b32 s7, s8, 0xffe +; GFX950-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9 +; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX950-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13 +; GFX950-SDAG-NEXT: v_readfirstlane_b32 s8, v1 +; GFX950-SDAG-NEXT: s_or_b32 s7, s7, s8 +; GFX950-SDAG-NEXT: v_readfirstlane_b32 s10, v2 +; GFX950-SDAG-NEXT: s_or_b32 s8, s7, 0x1000 +; GFX950-SDAG-NEXT: s_lshr_b32 s11, s8, s10 +; GFX950-SDAG-NEXT: s_lshl_b32 s10, s11, s10 +; GFX950-SDAG-NEXT: s_cmp_lg_u32 s10, s8 +; GFX950-SDAG-NEXT: s_cselect_b32 s8, 1, 0 +; GFX950-SDAG-NEXT: s_addk_i32 s9, 0xfc10 +; GFX950-SDAG-NEXT: s_lshl_b32 s10, s9, 12 +; GFX950-SDAG-NEXT: s_or_b32 s8, s11, s8 +; GFX950-SDAG-NEXT: s_or_b32 s10, s7, s10 +; GFX950-SDAG-NEXT: s_cmp_lt_i32 s9, 1 +; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, s10 +; GFX950-SDAG-NEXT: s_and_b32 s10, s8, 7 +; GFX950-SDAG-NEXT: s_cmp_gt_i32 s10, 5 +; GFX950-SDAG-NEXT: s_cselect_b32 s11, 1, 0 +; GFX950-SDAG-NEXT: s_cmp_eq_u32 s10, 3 +; GFX950-SDAG-NEXT: s_cselect_b32 s10, 1, 0 +; GFX950-SDAG-NEXT: s_lshr_b32 s8, s8, 2 +; GFX950-SDAG-NEXT: s_or_b32 s10, s10, s11 +; GFX950-SDAG-NEXT: s_add_i32 s8, s8, s10 +; GFX950-SDAG-NEXT: s_cmp_lt_i32 s9, 31 +; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00 +; GFX950-SDAG-NEXT: s_cmp_lg_u32 s7, 0 +; GFX950-SDAG-NEXT: s_cselect_b32 s7, s4, 0x7c00 +; GFX950-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f +; GFX950-SDAG-NEXT: s_cselect_b32 s7, s7, s8 +; GFX950-SDAG-NEXT: s_and_b32 s8, s6, 0x1ff +; GFX950-SDAG-NEXT: v_or_b32_e32 v0, s8, v0 +; GFX950-SDAG-NEXT: s_lshr_b32 s5, s5, 16 +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX950-SDAG-NEXT: s_lshr_b32 s9, s6, 8 +; GFX950-SDAG-NEXT: s_bfe_u32 s10, s6, 0xb0014 +; GFX950-SDAG-NEXT: s_and_b32 s5, s5, 0x8000 +; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-SDAG-NEXT: s_and_b32 s8, s9, 0xffe +; GFX950-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10 +; GFX950-SDAG-NEXT: s_or_b32 s5, s5, s7 +; GFX950-SDAG-NEXT: v_readfirstlane_b32 s7, v0 +; GFX950-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13 +; GFX950-SDAG-NEXT: s_or_b32 s7, s8, s7 +; GFX950-SDAG-NEXT: v_readfirstlane_b32 s9, v1 +; GFX950-SDAG-NEXT: s_or_b32 s8, s7, 0x1000 +; GFX950-SDAG-NEXT: s_lshr_b32 s11, s8, s9 +; GFX950-SDAG-NEXT: s_lshl_b32 s9, s11, s9 +; GFX950-SDAG-NEXT: s_cmp_lg_u32 s9, s8 +; GFX950-SDAG-NEXT: s_cselect_b32 s8, 1, 0 +; GFX950-SDAG-NEXT: s_addk_i32 s10, 0xfc10 +; GFX950-SDAG-NEXT: s_lshl_b32 s9, s10, 12 +; GFX950-SDAG-NEXT: s_or_b32 s8, s11, s8 +; GFX950-SDAG-NEXT: s_or_b32 s9, s7, s9 +; GFX950-SDAG-NEXT: s_cmp_lt_i32 s10, 1 +; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, s9 +; GFX950-SDAG-NEXT: s_and_b32 s9, s8, 7 +; GFX950-SDAG-NEXT: s_cmp_gt_i32 s9, 5 +; GFX950-SDAG-NEXT: 
s_cselect_b32 s11, 1, 0 +; GFX950-SDAG-NEXT: s_cmp_eq_u32 s9, 3 +; GFX950-SDAG-NEXT: s_cselect_b32 s9, 1, 0 +; GFX950-SDAG-NEXT: s_lshr_b32 s8, s8, 2 +; GFX950-SDAG-NEXT: s_or_b32 s9, s9, s11 +; GFX950-SDAG-NEXT: s_add_i32 s8, s8, s9 +; GFX950-SDAG-NEXT: s_cmp_lt_i32 s10, 31 +; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00 +; GFX950-SDAG-NEXT: s_cmp_lg_u32 s7, 0 +; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00 +; GFX950-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f +; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, s8 +; GFX950-SDAG-NEXT: s_lshr_b32 s6, s6, 16 +; GFX950-SDAG-NEXT: s_and_b32 s6, s6, 0x8000 +; GFX950-SDAG-NEXT: s_or_b32 s4, s6, s4 +; GFX950-SDAG-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX950-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: fptrunc_v2f64_to_v2f16: +; GFX950-GISEL: ; %bb.0: ; %entry +; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_bfe_u32 s2, s5, 0xb0014 +; GFX950-GISEL-NEXT: s_lshr_b32 s3, s5, 8 +; GFX950-GISEL-NEXT: s_and_b32 s8, s5, 0x1ff +; GFX950-GISEL-NEXT: s_addk_i32 s2, 0xfc10 +; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0xffe +; GFX950-GISEL-NEXT: s_or_b32 s4, s8, s4 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0 +; GFX950-GISEL-NEXT: s_or_b32 s3, s3, s4 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, 0 +; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0 +; GFX950-GISEL-NEXT: s_sub_i32 s9, 1, s2 +; GFX950-GISEL-NEXT: s_lshl_b32 s8, s2, 12 +; GFX950-GISEL-NEXT: s_max_i32 s9, s9, 0 +; GFX950-GISEL-NEXT: s_or_b32 s8, s3, s8 +; GFX950-GISEL-NEXT: s_min_i32 s9, s9, 13 +; GFX950-GISEL-NEXT: s_bitset1_b32 s3, 12 +; GFX950-GISEL-NEXT: s_lshl_b32 s4, s4, 9 +; GFX950-GISEL-NEXT: s_lshr_b32 s10, s3, s9 +; GFX950-GISEL-NEXT: s_or_b32 s4, s4, 0x7c00 +; GFX950-GISEL-NEXT: s_lshl_b32 s9, s10, s9 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s9, s3 +; GFX950-GISEL-NEXT: s_cselect_b32 s3, 1, 0 +; GFX950-GISEL-NEXT: s_or_b32 s3, s10, s3 +; GFX950-GISEL-NEXT: s_cmp_lt_i32 s2, 1 +; GFX950-GISEL-NEXT: s_cselect_b32 s3, s3, s8 +; GFX950-GISEL-NEXT: s_and_b32 s8, s3, 7 +; GFX950-GISEL-NEXT: s_lshr_b32 s3, s3, 2 +; GFX950-GISEL-NEXT: s_cmp_eq_u32 s8, 3 +; GFX950-GISEL-NEXT: s_cselect_b32 s9, 1, 0 +; GFX950-GISEL-NEXT: s_cmp_gt_i32 s8, 5 +; GFX950-GISEL-NEXT: s_cselect_b32 s8, 1, 0 +; GFX950-GISEL-NEXT: s_or_b32 s8, s9, s8 +; GFX950-GISEL-NEXT: s_add_i32 s3, s3, s8 +; GFX950-GISEL-NEXT: s_cmp_gt_i32 s2, 30 +; GFX950-GISEL-NEXT: s_cselect_b32 s3, 0x7c00, s3 +; GFX950-GISEL-NEXT: s_cmpk_eq_i32 s2, 0x40f +; GFX950-GISEL-NEXT: s_cselect_b32 s2, s4, s3 +; GFX950-GISEL-NEXT: s_lshr_b32 s3, s5, 16 +; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0x8000 +; GFX950-GISEL-NEXT: s_or_b32 s2, s3, s2 +; GFX950-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014 +; GFX950-GISEL-NEXT: s_lshr_b32 s4, s7, 8 +; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff +; GFX950-GISEL-NEXT: s_addk_i32 s3, 0xfc10 +; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0xffe +; GFX950-GISEL-NEXT: s_or_b32 s5, s5, s6 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s5, 0 +; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0 +; GFX950-GISEL-NEXT: s_or_b32 s4, s4, s5 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0 +; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0 +; GFX950-GISEL-NEXT: s_sub_i32 s8, 1, s3 +; GFX950-GISEL-NEXT: s_lshl_b32 s6, s3, 12 +; GFX950-GISEL-NEXT: s_max_i32 s8, s8, 0 +; GFX950-GISEL-NEXT: s_or_b32 s6, s4, s6 
+; GFX950-GISEL-NEXT: s_min_i32 s8, s8, 13 +; GFX950-GISEL-NEXT: s_bitset1_b32 s4, 12 +; GFX950-GISEL-NEXT: s_lshl_b32 s5, s5, 9 +; GFX950-GISEL-NEXT: s_lshr_b32 s9, s4, s8 +; GFX950-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00 +; GFX950-GISEL-NEXT: s_lshl_b32 s8, s9, s8 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s8, s4 +; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0 +; GFX950-GISEL-NEXT: s_or_b32 s4, s9, s4 +; GFX950-GISEL-NEXT: s_cmp_lt_i32 s3, 1 +; GFX950-GISEL-NEXT: s_cselect_b32 s4, s4, s6 +; GFX950-GISEL-NEXT: s_and_b32 s6, s4, 7 +; GFX950-GISEL-NEXT: s_lshr_b32 s4, s4, 2 +; GFX950-GISEL-NEXT: s_cmp_eq_u32 s6, 3 +; GFX950-GISEL-NEXT: s_cselect_b32 s8, 1, 0 +; GFX950-GISEL-NEXT: s_cmp_gt_i32 s6, 5 +; GFX950-GISEL-NEXT: s_cselect_b32 s6, 1, 0 +; GFX950-GISEL-NEXT: s_or_b32 s6, s8, s6 +; GFX950-GISEL-NEXT: s_add_i32 s4, s4, s6 +; GFX950-GISEL-NEXT: s_cmp_gt_i32 s3, 30 +; GFX950-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4 +; GFX950-GISEL-NEXT: s_cmpk_eq_i32 s3, 0x40f +; GFX950-GISEL-NEXT: s_cselect_b32 s3, s5, s4 +; GFX950-GISEL-NEXT: s_lshr_b32 s4, s7, 16 +; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0x8000 +; GFX950-GISEL-NEXT: s_or_b32 s3, s4, s3 +; GFX950-GISEL-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX950-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; GFX950-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX950-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-TRUE16-LABEL: fptrunc_v2f64_to_v2f16: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-SDAG-TRUE16-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s2, v3 +; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s3, s2, 0x1ff +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s2, 8 +; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, s3, v2 +; GFX11-SDAG-TRUE16-NEXT: s_bfe_u32 s3, s2, 0xb0014 +; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe +; GFX11-SDAG-TRUE16-NEXT: s_sub_i32 s4, 0x3f1, s3 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, s4, 0, 13 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s8, v3 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s4, v2 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s4, 0x1000 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s9, s5, s8 +; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s9, s8 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s8, s5 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-SDAG-TRUE16-NEXT: s_addk_i32 s3, 0xfc10 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s9, s5 +; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s3, 12 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s4, s8 +; 
GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 1 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, s8 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s8, s5, 7 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_gt_i32 s8, 5 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s9, 1, 0 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_eq_u32 s8, 3 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s8, 1, 0 +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s5, 2 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: s_add_i32 s5, s5, s8 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 31 +; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s8, 0x7e00 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s9, s8, 0x7c00 +; GFX11-SDAG-TRUE16-NEXT: s_cmpk_eq_i32 s3, 0x40f +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s9, s5 +; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s5, s4, 0x1ff +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s10, s4, 8 +; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, s5, v0 +; GFX11-SDAG-TRUE16-NEXT: s_bfe_u32 s5, s4, 0xb0014 +; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s10, s10, 0xffe +; GFX11-SDAG-TRUE16-NEXT: s_sub_i32 s9, 0x3f1, s5 +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v1, s9, 0, 13 +; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s2, s2, 0x8000 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s11, v1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s9, v0 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s10, s9, 0x1000 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s12, s10, s11 +; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s11, s12, s11 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s11, s10 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, 1, 0 +; GFX11-SDAG-TRUE16-NEXT: s_addk_i32 s5, 0xfc10 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s3, s12, s3 +; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s10, s5, 12 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s10, s9, s10 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s5, 1 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s3, s10 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s10, s3, 7 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_gt_i32 s10, 5 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s11, 1, 0 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_eq_u32 s10, 3 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s10, 1, 0 +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s3, s3, 2 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: s_add_i32 s3, s3, s10 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s5, 31 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s3, 0x7c00 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s9, 0 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s8, s8, 0x7c00 
+; GFX11-SDAG-TRUE16-NEXT: s_cmpk_eq_i32 s5, 0x40f +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s8, s3 +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s4, s4, 16 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s4, s4, 0x8000 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s3, s2 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX11-SDAG-FAKE16-LABEL: fptrunc_v2f64_to_v2f16: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-SDAG-FAKE16-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s2, v3 +; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s3, s2, 0x1ff +; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s2, 8 +; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v2, s3, v2 +; GFX11-SDAG-FAKE16-NEXT: s_bfe_u32 s3, s2, 0xb0014 +; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe +; GFX11-SDAG-FAKE16-NEXT: s_sub_i32 s4, 0x3f1, s3 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v3, s4, 0, 13 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s8, v3 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s4, v2 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s4, 0x1000 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s9, s5, s8 +; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s9, s8 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s8, s5 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-SDAG-FAKE16-NEXT: s_addk_i32 s3, 0xfc10 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s9, s5 +; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s3, 12 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s4, s8 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 1 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, s8 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s8, s5, 7 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_gt_i32 s8, 5 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s9, 1, 0 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_eq_u32 s8, 3 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s8, 1, 0 +; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s5, 2 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: s_add_i32 s5, s5, s8 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 31 +; GFX11-SDAG-FAKE16-NEXT: 
s_movk_i32 s8, 0x7e00 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s9, s8, 0x7c00 +; GFX11-SDAG-FAKE16-NEXT: s_cmpk_eq_i32 s3, 0x40f +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s9, s5 +; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s5, s4, 0x1ff +; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s10, s4, 8 +; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, s5, v0 +; GFX11-SDAG-FAKE16-NEXT: s_bfe_u32 s5, s4, 0xb0014 +; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s10, s10, 0xffe +; GFX11-SDAG-FAKE16-NEXT: s_sub_i32 s9, 0x3f1, s5 +; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-SDAG-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v1, s9, 0, 13 +; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s2, s2, 0x8000 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s11, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s9, v0 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s10, s9, 0x1000 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s12, s10, s11 +; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s11, s12, s11 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s11, s10 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, 1, 0 +; GFX11-SDAG-FAKE16-NEXT: s_addk_i32 s5, 0xfc10 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s3, s12, s3 +; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s10, s5, 12 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s10, s9, s10 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s5, 1 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s3, s10 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s10, s3, 7 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_gt_i32 s10, 5 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s11, 1, 0 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_eq_u32 s10, 3 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s10, 1, 0 +; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s3, s3, 2 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: s_add_i32 s3, s3, s10 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s5, 31 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s3, 0x7c00 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s9, 0 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s8, s8, 0x7c00 +; GFX11-SDAG-FAKE16-NEXT: s_cmpk_eq_i32 s5, 0x40f +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s8, s3 +; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s4, s4, 16 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s4, s4, 0x8000 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s3, s2 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-SDAG-FAKE16-NEXT: s_endpgm +; +; 
GFX11-GISEL-TRUE16-LABEL: fptrunc_v2f64_to_v2f16: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff +; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014 +; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4 +; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s2, 0xfc10 +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe +; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 +; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s8, 1, s2 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000 +; GFX11-GISEL-TRUE16-NEXT: s_max_i32 s8, s8, 0 +; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s9, s2, 12 +; GFX11-GISEL-TRUE16-NEXT: s_min_i32 s8, s8, 13 +; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s4, s4, 9 +; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s11, s10, s8 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s9 +; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s8, s11, s8 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s4, 0x7c00 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s8, s10 +; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s8, 1, 0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s8, s11, s8 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_lt_i32 s2, 1 +; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, s8, s3 +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s3, 7 +; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s3, 2 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_eq_u32 s8, 3 +; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s9, 1, 0 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s8, 5 +; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s8, 1, 0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-GISEL-TRUE16-NEXT: s_add_i32 s3, s3, s8 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s2, 30 +; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 0x7c00, s3 +; GFX11-GISEL-TRUE16-NEXT: s_cmpk_eq_i32 s2, 0x40f +; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3 +; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16 +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff +; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014 +; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8 +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6 +; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10 +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0 +; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000 +; GFX11-GISEL-TRUE16-NEXT: s_max_i32 s6, s6, 0 +; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s8, s4, 12 +; GFX11-GISEL-TRUE16-NEXT: 
s_min_i32 s6, s6, 13 +; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s5, s5, 9 +; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s10, s9, s6 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s8 +; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s6, s10, s6 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s5, s5, 0x7c00 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, s9 +; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s6, 1, 0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s10, s6 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_lt_i32 s4, 1 +; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, s6, s3 +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 7 +; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s3, 2 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_eq_u32 s6, 3 +; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s8, 1, 0 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s6, 5 +; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s6, 1, 0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6 +; GFX11-GISEL-TRUE16-NEXT: s_add_i32 s3, s3, s6 +; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s4, 30 +; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 0x7c00, s3 +; GFX11-GISEL-TRUE16-NEXT: s_cmpk_eq_i32 s4, 0x40f +; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, s5, s3 +; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s4, s7, 16 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s4, s4, 0x8000 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-GISEL-TRUE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: fptrunc_v2f64_to_v2f16: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff +; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014 +; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4 +; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s2, 0xfc10 +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe +; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 +; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s8, 1, s2 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000 +; GFX11-GISEL-FAKE16-NEXT: s_max_i32 s8, s8, 0 +; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s9, s2, 12 +; GFX11-GISEL-FAKE16-NEXT: s_min_i32 s8, s8, 13 +; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s4, s4, 9 +; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s11, s10, s8 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s9 +; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s8, s11, s8 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s4, 0x7c00 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s8, s10 +; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s8, 1, 0 +; GFX11-GISEL-FAKE16-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s8, s11, s8 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_lt_i32 s2, 1 +; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, s8, s3 +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s3, 7 +; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s3, 2 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_eq_u32 s8, 3 +; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s9, 1, 0 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s8, 5 +; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s8, 1, 0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-GISEL-FAKE16-NEXT: s_add_i32 s3, s3, s8 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s2, 30 +; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 0x7c00, s3 +; GFX11-GISEL-FAKE16-NEXT: s_cmpk_eq_i32 s2, 0x40f +; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3 +; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16 +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff +; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014 +; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8 +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6 +; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10 +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0 +; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000 +; GFX11-GISEL-FAKE16-NEXT: s_max_i32 s6, s6, 0 +; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s8, s4, 12 +; GFX11-GISEL-FAKE16-NEXT: s_min_i32 s6, s6, 13 +; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s5, s5, 9 +; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s10, s9, s6 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s8 +; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s6, s10, s6 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s5, s5, 0x7c00 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, s9 +; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s6, 1, 0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s10, s6 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_lt_i32 s4, 1 +; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, s6, s3 +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 7 +; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s3, 2 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_eq_u32 s6, 3 +; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s8, 1, 0 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s6, 5 +; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s6, 1, 0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6 +; GFX11-GISEL-FAKE16-NEXT: s_add_i32 s3, s3, s6 +; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s4, 30 +; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 0x7c00, s3 +; GFX11-GISEL-FAKE16-NEXT: s_cmpk_eq_i32 s4, 0x40f +; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, s5, s3 +; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s4, s7, 16 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s4, s4, 0x8000 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; 
GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-GISEL-FAKE16-NEXT: s_endpgm + ptr addrspace(1) %r, + ptr addrspace(1) %a) { +entry: + %a.val = load <2 x double>, ptr addrspace(1) %a + %r.val = fptrunc <2 x double> %a.val to <2 x half> + store <2 x half> %r.val, ptr addrspace(1) %r + ret void +} + +define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn( +; SI-SDAG-LABEL: fptrunc_v2f64_to_v2f16_afn: +; SI-SDAG: ; %bb.0: ; %entry +; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s2, -1 +; SI-SDAG-NEXT: s_mov_b32 s10, s2 +; SI-SDAG-NEXT: s_mov_b32 s11, s3 +; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SI-SDAG-NEXT: s_mov_b32 s8, s6 +; SI-SDAG-NEXT: s_mov_b32 s9, s7 +; SI-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SI-SDAG-NEXT: s_movk_i32 s0, 0x7e00 +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) +; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; SI-SDAG-NEXT: v_readfirstlane_b32 s6, v1 +; SI-SDAG-NEXT: s_and_b32 s7, s1, 0x1ff +; SI-SDAG-NEXT: s_lshr_b32 s8, s1, 8 +; SI-SDAG-NEXT: s_bfe_u32 s9, s1, 0xb0014 +; SI-SDAG-NEXT: v_or_b32_e32 v1, s7, v2 +; SI-SDAG-NEXT: s_and_b32 s7, s8, 0xffe +; SI-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9 +; SI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13 +; SI-SDAG-NEXT: v_readfirstlane_b32 s8, v1 +; SI-SDAG-NEXT: v_readfirstlane_b32 s10, v2 +; SI-SDAG-NEXT: s_or_b32 s7, s7, s8 +; SI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000 +; SI-SDAG-NEXT: s_lshr_b32 s11, s8, s10 +; SI-SDAG-NEXT: s_lshl_b32 s10, s11, s10 +; SI-SDAG-NEXT: s_cmp_lg_u32 s10, s8 +; SI-SDAG-NEXT: s_cselect_b32 s8, 1, 0 +; SI-SDAG-NEXT: s_addk_i32 s9, 0xfc10 +; SI-SDAG-NEXT: s_or_b32 s8, s11, s8 +; SI-SDAG-NEXT: s_lshl_b32 s10, s9, 12 +; SI-SDAG-NEXT: s_or_b32 s10, s7, s10 +; SI-SDAG-NEXT: s_cmp_lt_i32 s9, 1 +; SI-SDAG-NEXT: s_cselect_b32 s8, s8, s10 +; SI-SDAG-NEXT: s_and_b32 s10, s8, 7 +; SI-SDAG-NEXT: s_cmp_gt_i32 s10, 5 +; SI-SDAG-NEXT: s_cselect_b32 s11, 1, 0 +; SI-SDAG-NEXT: s_cmp_eq_u32 s10, 3 +; SI-SDAG-NEXT: s_cselect_b32 s10, 1, 0 +; SI-SDAG-NEXT: s_lshr_b32 s8, s8, 2 +; SI-SDAG-NEXT: s_or_b32 s10, s10, s11 +; SI-SDAG-NEXT: s_add_i32 s8, s8, s10 +; SI-SDAG-NEXT: s_cmp_lt_i32 s9, 31 +; SI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00 +; SI-SDAG-NEXT: s_cmp_lg_u32 s7, 0 +; SI-SDAG-NEXT: s_cselect_b32 s7, s0, 0x7c00 +; SI-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f +; SI-SDAG-NEXT: s_cselect_b32 s7, s7, s8 +; SI-SDAG-NEXT: s_lshr_b32 s1, s1, 16 +; SI-SDAG-NEXT: s_and_b32 s8, s6, 0x1ff +; SI-SDAG-NEXT: s_lshr_b32 s9, s6, 8 +; SI-SDAG-NEXT: s_bfe_u32 s10, s6, 0xb0014 +; SI-SDAG-NEXT: s_and_b32 s1, s1, 0x8000 +; SI-SDAG-NEXT: v_or_b32_e32 v0, s8, v0 +; SI-SDAG-NEXT: s_and_b32 s8, s9, 0xffe +; SI-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10 +; SI-SDAG-NEXT: s_or_b32 s1, s1, s7 +; SI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13 +; SI-SDAG-NEXT: s_lshl_b32 s1, s1, 16 +; SI-SDAG-NEXT: v_readfirstlane_b32 s7, v0 +; SI-SDAG-NEXT: v_readfirstlane_b32 s9, v1 +; SI-SDAG-NEXT: s_or_b32 s7, s8, s7 +; SI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000 +; SI-SDAG-NEXT: s_lshr_b32 s11, s8, s9 +; SI-SDAG-NEXT: s_lshl_b32 s9, s11, s9 +; SI-SDAG-NEXT: 
s_cmp_lg_u32 s9, s8 +; SI-SDAG-NEXT: s_cselect_b32 s8, 1, 0 +; SI-SDAG-NEXT: s_addk_i32 s10, 0xfc10 +; SI-SDAG-NEXT: s_or_b32 s8, s11, s8 +; SI-SDAG-NEXT: s_lshl_b32 s9, s10, 12 +; SI-SDAG-NEXT: s_or_b32 s9, s7, s9 +; SI-SDAG-NEXT: s_cmp_lt_i32 s10, 1 +; SI-SDAG-NEXT: s_cselect_b32 s8, s8, s9 +; SI-SDAG-NEXT: s_and_b32 s9, s8, 7 +; SI-SDAG-NEXT: s_cmp_gt_i32 s9, 5 +; SI-SDAG-NEXT: s_cselect_b32 s11, 1, 0 +; SI-SDAG-NEXT: s_cmp_eq_u32 s9, 3 +; SI-SDAG-NEXT: s_cselect_b32 s9, 1, 0 +; SI-SDAG-NEXT: s_lshr_b32 s8, s8, 2 +; SI-SDAG-NEXT: s_or_b32 s9, s9, s11 +; SI-SDAG-NEXT: s_add_i32 s8, s8, s9 +; SI-SDAG-NEXT: s_cmp_lt_i32 s10, 31 +; SI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00 +; SI-SDAG-NEXT: s_cmp_lg_u32 s7, 0 +; SI-SDAG-NEXT: s_cselect_b32 s0, s0, 0x7c00 +; SI-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f +; SI-SDAG-NEXT: s_cselect_b32 s0, s0, s8 +; SI-SDAG-NEXT: s_lshr_b32 s6, s6, 16 +; SI-SDAG-NEXT: s_and_b32 s6, s6, 0x8000 +; SI-SDAG-NEXT: s_or_b32 s0, s6, s0 +; SI-SDAG-NEXT: s_and_b32 s0, s0, 0xffff +; SI-SDAG-NEXT: s_or_b32 s6, s0, s1 +; SI-SDAG-NEXT: s_mov_b32 s0, s4 +; SI-SDAG-NEXT: s_mov_b32 s1, s5 +; SI-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-SDAG-NEXT: s_endpgm +; +; SI-GISEL-LABEL: fptrunc_v2f64_to_v2f16_afn: +; SI-GISEL: ; %bb.0: ; %entry +; SI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; SI-GISEL-NEXT: s_mov_b32 s2, -1 +; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; SI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] ; SI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] ; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -664,29 +3145,111 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-GISEL-NEXT: s_endpgm ; -; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f16: +; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f16_afn: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; VI-SDAG-NEXT: s_mov_b32 s6, -1 -; VI-SDAG-NEXT: s_mov_b32 s10, s6 -; VI-SDAG-NEXT: s_mov_b32 s11, s7 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 +; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; VI-SDAG-NEXT: s_mov_b32 s2, -1 +; VI-SDAG-NEXT: s_mov_b32 s10, s2 +; VI-SDAG-NEXT: s_mov_b32 s11, s3 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_mov_b32 s8, s2 -; VI-SDAG-NEXT: s_mov_b32 s9, s3 +; VI-SDAG-NEXT: s_mov_b32 s8, s6 +; VI-SDAG-NEXT: s_mov_b32 s9, s7 ; VI-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; VI-SDAG-NEXT: s_mov_b32 s4, s0 -; VI-SDAG-NEXT: s_mov_b32 s5, s1 +; VI-SDAG-NEXT: s_mov_b32 s0, s4 +; VI-SDAG-NEXT: s_mov_b32 s1, s5 +; VI-SDAG-NEXT: s_movk_i32 s6, 0x7e00 ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) -; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] -; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] -; VI-SDAG-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-SDAG-NEXT: v_readfirstlane_b32 s4, v3 +; VI-SDAG-NEXT: s_and_b32 s7, s4, 0x1ff +; VI-SDAG-NEXT: v_readfirstlane_b32 s5, v1 +; VI-SDAG-NEXT: v_or_b32_e32 v1, s7, v2 +; VI-SDAG-NEXT: s_lshr_b32 s8, s4, 8 +; VI-SDAG-NEXT: s_bfe_u32 s9, s4, 0xb0014 +; VI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-SDAG-NEXT: s_and_b32 s7, s8, 0xffe +; VI-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-SDAG-NEXT: v_med3_i32 
v2, s8, 0, 13 +; VI-SDAG-NEXT: v_readfirstlane_b32 s8, v1 +; VI-SDAG-NEXT: s_or_b32 s7, s7, s8 +; VI-SDAG-NEXT: v_readfirstlane_b32 s10, v2 +; VI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000 +; VI-SDAG-NEXT: s_lshr_b32 s11, s8, s10 +; VI-SDAG-NEXT: s_lshl_b32 s10, s11, s10 +; VI-SDAG-NEXT: s_cmp_lg_u32 s10, s8 +; VI-SDAG-NEXT: s_cselect_b32 s8, 1, 0 +; VI-SDAG-NEXT: s_addk_i32 s9, 0xfc10 +; VI-SDAG-NEXT: s_lshl_b32 s10, s9, 12 +; VI-SDAG-NEXT: s_or_b32 s8, s11, s8 +; VI-SDAG-NEXT: s_or_b32 s10, s7, s10 +; VI-SDAG-NEXT: s_cmp_lt_i32 s9, 1 +; VI-SDAG-NEXT: s_cselect_b32 s8, s8, s10 +; VI-SDAG-NEXT: s_and_b32 s10, s8, 7 +; VI-SDAG-NEXT: s_cmp_gt_i32 s10, 5 +; VI-SDAG-NEXT: s_cselect_b32 s11, 1, 0 +; VI-SDAG-NEXT: s_cmp_eq_u32 s10, 3 +; VI-SDAG-NEXT: s_cselect_b32 s10, 1, 0 +; VI-SDAG-NEXT: s_lshr_b32 s8, s8, 2 +; VI-SDAG-NEXT: s_or_b32 s10, s10, s11 +; VI-SDAG-NEXT: s_add_i32 s8, s8, s10 +; VI-SDAG-NEXT: s_cmp_lt_i32 s9, 31 +; VI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00 +; VI-SDAG-NEXT: s_cmp_lg_u32 s7, 0 +; VI-SDAG-NEXT: s_cselect_b32 s7, s6, 0x7c00 +; VI-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f +; VI-SDAG-NEXT: s_cselect_b32 s7, s7, s8 +; VI-SDAG-NEXT: s_and_b32 s8, s5, 0x1ff +; VI-SDAG-NEXT: v_or_b32_e32 v0, s8, v0 +; VI-SDAG-NEXT: s_lshr_b32 s4, s4, 16 +; VI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-SDAG-NEXT: s_lshr_b32 s9, s5, 8 +; VI-SDAG-NEXT: s_bfe_u32 s10, s5, 0xb0014 +; VI-SDAG-NEXT: s_and_b32 s4, s4, 0x8000 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-SDAG-NEXT: s_and_b32 s8, s9, 0xffe +; VI-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10 +; VI-SDAG-NEXT: s_or_b32 s4, s4, s7 +; VI-SDAG-NEXT: v_readfirstlane_b32 s7, v0 +; VI-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13 +; VI-SDAG-NEXT: s_or_b32 s7, s8, s7 +; VI-SDAG-NEXT: v_readfirstlane_b32 s9, v1 +; VI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000 +; VI-SDAG-NEXT: s_lshr_b32 s11, s8, s9 +; VI-SDAG-NEXT: s_lshl_b32 s4, s4, 16 +; VI-SDAG-NEXT: s_lshl_b32 s9, s11, s9 +; VI-SDAG-NEXT: s_cmp_lg_u32 s9, s8 +; VI-SDAG-NEXT: s_cselect_b32 s8, 1, 0 +; VI-SDAG-NEXT: s_addk_i32 s10, 0xfc10 +; VI-SDAG-NEXT: s_lshl_b32 s9, s10, 12 +; VI-SDAG-NEXT: s_or_b32 s8, s11, s8 +; VI-SDAG-NEXT: s_or_b32 s9, s7, s9 +; VI-SDAG-NEXT: s_cmp_lt_i32 s10, 1 +; VI-SDAG-NEXT: s_cselect_b32 s8, s8, s9 +; VI-SDAG-NEXT: s_and_b32 s9, s8, 7 +; VI-SDAG-NEXT: s_cmp_gt_i32 s9, 5 +; VI-SDAG-NEXT: s_cselect_b32 s11, 1, 0 +; VI-SDAG-NEXT: s_cmp_eq_u32 s9, 3 +; VI-SDAG-NEXT: s_cselect_b32 s9, 1, 0 +; VI-SDAG-NEXT: s_lshr_b32 s8, s8, 2 +; VI-SDAG-NEXT: s_or_b32 s9, s9, s11 +; VI-SDAG-NEXT: s_add_i32 s8, s8, s9 +; VI-SDAG-NEXT: s_cmp_lt_i32 s10, 31 +; VI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00 +; VI-SDAG-NEXT: s_cmp_lg_u32 s7, 0 +; VI-SDAG-NEXT: s_cselect_b32 s6, s6, 0x7c00 +; VI-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f +; VI-SDAG-NEXT: s_cselect_b32 s6, s6, s8 +; VI-SDAG-NEXT: s_lshr_b32 s5, s5, 16 +; VI-SDAG-NEXT: s_and_b32 s5, s5, 0x8000 +; VI-SDAG-NEXT: s_or_b32 s5, s5, s6 +; VI-SDAG-NEXT: s_and_b32 s5, s5, 0xffff +; VI-SDAG-NEXT: s_or_b32 s4, s5, s4 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; VI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-SDAG-NEXT: s_endpgm ; -; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f16: +; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f16_afn: ; VI-GISEL: ; %bb.0: ; %entry ; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -702,29 +3265,109 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: fptrunc_v2f64_to_v2f16: +; GFX9-SDAG-LABEL: 
fptrunc_v2f64_to_v2f16_afn: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 -; GFX9-SDAG-NEXT: s_mov_b32 s11, s7 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s6, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s7, s3 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s8, s2 -; GFX9-SDAG-NEXT: s_mov_b32 s9, s3 -; GFX9-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GFX9-SDAG-NEXT: s_mov_b32 s4, s0 -; GFX9-SDAG-NEXT: s_mov_b32 s5, s1 +; GFX9-SDAG-NEXT: s_mov_b32 s4, s10 +; GFX9-SDAG-NEXT: s_mov_b32 s5, s11 +; GFX9-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX9-SDAG-NEXT: s_mov_b32 s0, s8 +; GFX9-SDAG-NEXT: s_mov_b32 s1, s9 +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7e00 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] -; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] -; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v2 -; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; GFX9-SDAG-NEXT: s_and_b32 s7, s5, 0x1ff +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s6, v1 +; GFX9-SDAG-NEXT: v_or_b32_e32 v1, s7, v2 +; GFX9-SDAG-NEXT: s_lshr_b32 s8, s5, 8 +; GFX9-SDAG-NEXT: s_bfe_u32 s9, s5, 0xb0014 +; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-SDAG-NEXT: s_and_b32 s7, s8, 0xffe +; GFX9-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s8, v1 +; GFX9-SDAG-NEXT: s_or_b32 s7, s7, s8 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s10, v2 +; GFX9-SDAG-NEXT: s_or_b32 s8, s7, 0x1000 +; GFX9-SDAG-NEXT: s_lshr_b32 s11, s8, s10 +; GFX9-SDAG-NEXT: s_lshl_b32 s10, s11, s10 +; GFX9-SDAG-NEXT: s_cmp_lg_u32 s10, s8 +; GFX9-SDAG-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-SDAG-NEXT: s_addk_i32 s9, 0xfc10 +; GFX9-SDAG-NEXT: s_lshl_b32 s10, s9, 12 +; GFX9-SDAG-NEXT: s_or_b32 s8, s11, s8 +; GFX9-SDAG-NEXT: s_or_b32 s10, s7, s10 +; GFX9-SDAG-NEXT: s_cmp_lt_i32 s9, 1 +; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, s10 +; GFX9-SDAG-NEXT: s_and_b32 s10, s8, 7 +; GFX9-SDAG-NEXT: s_cmp_gt_i32 s10, 5 +; GFX9-SDAG-NEXT: s_cselect_b32 s11, 1, 0 +; GFX9-SDAG-NEXT: s_cmp_eq_u32 s10, 3 +; GFX9-SDAG-NEXT: s_cselect_b32 s10, 1, 0 +; GFX9-SDAG-NEXT: s_lshr_b32 s8, s8, 2 +; GFX9-SDAG-NEXT: s_or_b32 s10, s10, s11 +; GFX9-SDAG-NEXT: s_add_i32 s8, s8, s10 +; GFX9-SDAG-NEXT: s_cmp_lt_i32 s9, 31 +; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00 +; GFX9-SDAG-NEXT: s_cmp_lg_u32 s7, 0 +; GFX9-SDAG-NEXT: s_cselect_b32 s7, s4, 0x7c00 +; GFX9-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f +; GFX9-SDAG-NEXT: s_cselect_b32 s7, s7, s8 +; GFX9-SDAG-NEXT: s_and_b32 s8, s6, 0x1ff +; GFX9-SDAG-NEXT: v_or_b32_e32 v0, s8, v0 +; GFX9-SDAG-NEXT: s_lshr_b32 s5, s5, 16 +; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-SDAG-NEXT: s_lshr_b32 s9, s6, 8 +; GFX9-SDAG-NEXT: s_bfe_u32 s10, s6, 0xb0014 +; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0x8000 +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-SDAG-NEXT: s_and_b32 s8, s9, 0xffe +; GFX9-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10 +; GFX9-SDAG-NEXT: s_or_b32 s5, s5, s7 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s7, v0 +; GFX9-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13 +; GFX9-SDAG-NEXT: s_or_b32 
s7, s8, s7 +; GFX9-SDAG-NEXT: v_readfirstlane_b32 s9, v1 +; GFX9-SDAG-NEXT: s_or_b32 s8, s7, 0x1000 +; GFX9-SDAG-NEXT: s_lshr_b32 s11, s8, s9 +; GFX9-SDAG-NEXT: s_lshl_b32 s9, s11, s9 +; GFX9-SDAG-NEXT: s_cmp_lg_u32 s9, s8 +; GFX9-SDAG-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-SDAG-NEXT: s_addk_i32 s10, 0xfc10 +; GFX9-SDAG-NEXT: s_lshl_b32 s9, s10, 12 +; GFX9-SDAG-NEXT: s_or_b32 s8, s11, s8 +; GFX9-SDAG-NEXT: s_or_b32 s9, s7, s9 +; GFX9-SDAG-NEXT: s_cmp_lt_i32 s10, 1 +; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, s9 +; GFX9-SDAG-NEXT: s_and_b32 s9, s8, 7 +; GFX9-SDAG-NEXT: s_cmp_gt_i32 s9, 5 +; GFX9-SDAG-NEXT: s_cselect_b32 s11, 1, 0 +; GFX9-SDAG-NEXT: s_cmp_eq_u32 s9, 3 +; GFX9-SDAG-NEXT: s_cselect_b32 s9, 1, 0 +; GFX9-SDAG-NEXT: s_lshr_b32 s8, s8, 2 +; GFX9-SDAG-NEXT: s_or_b32 s9, s9, s11 +; GFX9-SDAG-NEXT: s_add_i32 s8, s8, s9 +; GFX9-SDAG-NEXT: s_cmp_lt_i32 s10, 31 +; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00 +; GFX9-SDAG-NEXT: s_cmp_lg_u32 s7, 0 +; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00 +; GFX9-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f +; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, s8 +; GFX9-SDAG-NEXT: s_lshr_b32 s6, s6, 16 +; GFX9-SDAG-NEXT: s_and_b32 s6, s6, 0x8000 +; GFX9-SDAG-NEXT: s_or_b32 s4, s6, s4 +; GFX9-SDAG-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-SDAG-NEXT: s_endpgm ; -; GFX9-GISEL-LABEL: fptrunc_v2f64_to_v2f16: +; GFX9-GISEL-LABEL: fptrunc_v2f64_to_v2f16_afn: ; GFX9-GISEL: ; %bb.0: ; %entry ; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -740,27 +3383,109 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-GISEL-NEXT: s_endpgm ; -; GFX950-SDAG-LABEL: fptrunc_v2f64_to_v2f16: +; GFX950-SDAG-LABEL: fptrunc_v2f64_to_v2f16_afn: ; GFX950-SDAG: ; %bb.0: ; %entry -; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX950-SDAG-NEXT: s_mov_b32 s7, 0xf000 -; GFX950-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX950-SDAG-NEXT: s_mov_b32 s10, s6 -; GFX950-SDAG-NEXT: s_mov_b32 s11, s7 +; GFX950-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; GFX950-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX950-SDAG-NEXT: s_mov_b32 s6, s2 +; GFX950-SDAG-NEXT: s_mov_b32 s7, s3 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: s_mov_b32 s8, s2 -; GFX950-SDAG-NEXT: s_mov_b32 s9, s3 -; GFX950-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GFX950-SDAG-NEXT: s_mov_b32 s4, s0 -; GFX950-SDAG-NEXT: s_mov_b32 s5, s1 +; GFX950-SDAG-NEXT: s_mov_b32 s4, s10 +; GFX950-SDAG-NEXT: s_mov_b32 s5, s11 +; GFX950-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GFX950-SDAG-NEXT: s_mov_b32 s0, s8 +; GFX950-SDAG-NEXT: s_mov_b32 s1, s9 +; GFX950-SDAG-NEXT: s_movk_i32 s4, 0x7e00 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX950-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] -; GFX950-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] -; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v0, v0, v2 -; GFX950-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX950-SDAG-NEXT: v_readfirstlane_b32 s5, v3 +; GFX950-SDAG-NEXT: s_and_b32 s7, s5, 0x1ff +; GFX950-SDAG-NEXT: v_readfirstlane_b32 s6, v1 +; GFX950-SDAG-NEXT: v_or_b32_e32 v1, s7, v2 +; GFX950-SDAG-NEXT: s_lshr_b32 s8, s5, 8 +; GFX950-SDAG-NEXT: s_bfe_u32 s9, s5, 0xb0014 +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX950-SDAG-NEXT: s_and_b32 s7, s8, 0xffe +; GFX950-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9 +; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v1, 
0, 1, vcc +; GFX950-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13 +; GFX950-SDAG-NEXT: v_readfirstlane_b32 s8, v1 +; GFX950-SDAG-NEXT: s_or_b32 s7, s7, s8 +; GFX950-SDAG-NEXT: v_readfirstlane_b32 s10, v2 +; GFX950-SDAG-NEXT: s_or_b32 s8, s7, 0x1000 +; GFX950-SDAG-NEXT: s_lshr_b32 s11, s8, s10 +; GFX950-SDAG-NEXT: s_lshl_b32 s10, s11, s10 +; GFX950-SDAG-NEXT: s_cmp_lg_u32 s10, s8 +; GFX950-SDAG-NEXT: s_cselect_b32 s8, 1, 0 +; GFX950-SDAG-NEXT: s_addk_i32 s9, 0xfc10 +; GFX950-SDAG-NEXT: s_lshl_b32 s10, s9, 12 +; GFX950-SDAG-NEXT: s_or_b32 s8, s11, s8 +; GFX950-SDAG-NEXT: s_or_b32 s10, s7, s10 +; GFX950-SDAG-NEXT: s_cmp_lt_i32 s9, 1 +; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, s10 +; GFX950-SDAG-NEXT: s_and_b32 s10, s8, 7 +; GFX950-SDAG-NEXT: s_cmp_gt_i32 s10, 5 +; GFX950-SDAG-NEXT: s_cselect_b32 s11, 1, 0 +; GFX950-SDAG-NEXT: s_cmp_eq_u32 s10, 3 +; GFX950-SDAG-NEXT: s_cselect_b32 s10, 1, 0 +; GFX950-SDAG-NEXT: s_lshr_b32 s8, s8, 2 +; GFX950-SDAG-NEXT: s_or_b32 s10, s10, s11 +; GFX950-SDAG-NEXT: s_add_i32 s8, s8, s10 +; GFX950-SDAG-NEXT: s_cmp_lt_i32 s9, 31 +; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00 +; GFX950-SDAG-NEXT: s_cmp_lg_u32 s7, 0 +; GFX950-SDAG-NEXT: s_cselect_b32 s7, s4, 0x7c00 +; GFX950-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f +; GFX950-SDAG-NEXT: s_cselect_b32 s7, s7, s8 +; GFX950-SDAG-NEXT: s_and_b32 s8, s6, 0x1ff +; GFX950-SDAG-NEXT: v_or_b32_e32 v0, s8, v0 +; GFX950-SDAG-NEXT: s_lshr_b32 s5, s5, 16 +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX950-SDAG-NEXT: s_lshr_b32 s9, s6, 8 +; GFX950-SDAG-NEXT: s_bfe_u32 s10, s6, 0xb0014 +; GFX950-SDAG-NEXT: s_and_b32 s5, s5, 0x8000 +; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-SDAG-NEXT: s_and_b32 s8, s9, 0xffe +; GFX950-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10 +; GFX950-SDAG-NEXT: s_or_b32 s5, s5, s7 +; GFX950-SDAG-NEXT: v_readfirstlane_b32 s7, v0 +; GFX950-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13 +; GFX950-SDAG-NEXT: s_or_b32 s7, s8, s7 +; GFX950-SDAG-NEXT: v_readfirstlane_b32 s9, v1 +; GFX950-SDAG-NEXT: s_or_b32 s8, s7, 0x1000 +; GFX950-SDAG-NEXT: s_lshr_b32 s11, s8, s9 +; GFX950-SDAG-NEXT: s_lshl_b32 s9, s11, s9 +; GFX950-SDAG-NEXT: s_cmp_lg_u32 s9, s8 +; GFX950-SDAG-NEXT: s_cselect_b32 s8, 1, 0 +; GFX950-SDAG-NEXT: s_addk_i32 s10, 0xfc10 +; GFX950-SDAG-NEXT: s_lshl_b32 s9, s10, 12 +; GFX950-SDAG-NEXT: s_or_b32 s8, s11, s8 +; GFX950-SDAG-NEXT: s_or_b32 s9, s7, s9 +; GFX950-SDAG-NEXT: s_cmp_lt_i32 s10, 1 +; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, s9 +; GFX950-SDAG-NEXT: s_and_b32 s9, s8, 7 +; GFX950-SDAG-NEXT: s_cmp_gt_i32 s9, 5 +; GFX950-SDAG-NEXT: s_cselect_b32 s11, 1, 0 +; GFX950-SDAG-NEXT: s_cmp_eq_u32 s9, 3 +; GFX950-SDAG-NEXT: s_cselect_b32 s9, 1, 0 +; GFX950-SDAG-NEXT: s_lshr_b32 s8, s8, 2 +; GFX950-SDAG-NEXT: s_or_b32 s9, s9, s11 +; GFX950-SDAG-NEXT: s_add_i32 s8, s8, s9 +; GFX950-SDAG-NEXT: s_cmp_lt_i32 s10, 31 +; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00 +; GFX950-SDAG-NEXT: s_cmp_lg_u32 s7, 0 +; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00 +; GFX950-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f +; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, s8 +; GFX950-SDAG-NEXT: s_lshr_b32 s6, s6, 16 +; GFX950-SDAG-NEXT: s_and_b32 s6, s6, 0x8000 +; GFX950-SDAG-NEXT: s_or_b32 s4, s6, s4 +; GFX950-SDAG-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX950-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX950-SDAG-NEXT: s_endpgm ; -; GFX950-GISEL-LABEL: fptrunc_v2f64_to_v2f16: +; GFX950-GISEL-LABEL: fptrunc_v2f64_to_v2f16_afn: ; GFX950-GISEL: ; %bb.0: ; %entry ; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x24 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) @@ -776,7 +3501,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX950-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX950-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-TRUE16-LABEL: fptrunc_v2f64_to_v2f16: +; GFX11-SDAG-TRUE16-LABEL: fptrunc_v2f64_to_v2f16_afn: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1 @@ -786,21 +3511,113 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2 ; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3 -; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0 ; GFX11-SDAG-TRUE16-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0 -; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] -; GFX11-SDAG-TRUE16-NEXT: v_cvt_f32_f64_e32 v1, v[0:1] -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v2 -; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s2, v3 +; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s3, s2, 0x1ff +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s2, 8 +; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, s3, v2 +; GFX11-SDAG-TRUE16-NEXT: s_bfe_u32 s3, s2, 0xb0014 +; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe +; GFX11-SDAG-TRUE16-NEXT: s_sub_i32 s4, 0x3f1, s3 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, s4, 0, 13 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s8, v3 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s4, v2 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s4, 0x1000 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s9, s5, s8 +; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s9, s8 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s8, s5 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-SDAG-TRUE16-NEXT: s_addk_i32 s3, 0xfc10 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s9, s5 +; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s3, 12 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s4, s8 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 1 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, s8 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s8, s5, 7 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_gt_i32 s8, 5 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s9, 1, 0 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_eq_u32 s8, 3 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s8, 1, 0 +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s5, 2 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: s_add_i32 s5, s5, s8 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 31 +; 
GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s8, 0x7e00 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s9, s8, 0x7c00 +; GFX11-SDAG-TRUE16-NEXT: s_cmpk_eq_i32 s3, 0x40f +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s9, s5 +; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s5, s4, 0x1ff +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s10, s4, 8 +; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, s5, v0 +; GFX11-SDAG-TRUE16-NEXT: s_bfe_u32 s5, s4, 0xb0014 +; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s10, s10, 0xffe +; GFX11-SDAG-TRUE16-NEXT: s_sub_i32 s9, 0x3f1, s5 +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v1, s9, 0, 13 +; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s2, s2, 0x8000 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s11, v1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s9, v0 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s10, s9, 0x1000 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s12, s10, s11 +; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s11, s12, s11 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s11, s10 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, 1, 0 +; GFX11-SDAG-TRUE16-NEXT: s_addk_i32 s5, 0xfc10 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s3, s12, s3 +; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s10, s5, 12 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s10, s9, s10 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s5, 1 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s3, s10 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s10, s3, 7 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_gt_i32 s10, 5 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s11, 1, 0 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_eq_u32 s10, 3 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s10, 1, 0 +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s3, s3, 2 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: s_add_i32 s3, s3, s10 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s5, 31 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s3, 0x7c00 +; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s9, 0 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s8, s8, 0x7c00 +; GFX11-SDAG-TRUE16-NEXT: s_cmpk_eq_i32 s5, 0x40f +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s8, s3 +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s4, s4, 16 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s4, s4, 0x8000 +; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s3, s2 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-SDAG-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; 
GFX11-SDAG-TRUE16-NEXT: s_endpgm ; -; GFX11-SDAG-FAKE16-LABEL: fptrunc_v2f64_to_v2f16: +; GFX11-SDAG-FAKE16-LABEL: fptrunc_v2f64_to_v2f16_afn: ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1 @@ -810,21 +3627,113 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2 ; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3 -; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0 ; GFX11-SDAG-FAKE16-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0 -; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1 ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-FAKE16-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] -; GFX11-SDAG-FAKE16-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v2 -; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s2, v3 +; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s3, s2, 0x1ff +; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s2, 8 +; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v2, s3, v2 +; GFX11-SDAG-FAKE16-NEXT: s_bfe_u32 s3, s2, 0xb0014 +; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe +; GFX11-SDAG-FAKE16-NEXT: s_sub_i32 s4, 0x3f1, s3 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v3, s4, 0, 13 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s8, v3 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s4, v2 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s4, s5, s4 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s4, 0x1000 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s9, s5, s8 +; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s9, s8 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s8, s5 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-SDAG-FAKE16-NEXT: s_addk_i32 s3, 0xfc10 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s9, s5 +; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s3, 12 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s4, s8 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 1 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, s8 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s8, s5, 7 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_gt_i32 s8, 5 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s9, 1, 0 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_eq_u32 s8, 3 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s8, 1, 0 +; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s5, 2 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s8, s9 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: s_add_i32 s5, s5, s8 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 31 +; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s8, 0x7e00 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s4, v1 +; 
GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s9, s8, 0x7c00 +; GFX11-SDAG-FAKE16-NEXT: s_cmpk_eq_i32 s3, 0x40f +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s9, s5 +; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s5, s4, 0x1ff +; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s10, s4, 8 +; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, s5, v0 +; GFX11-SDAG-FAKE16-NEXT: s_bfe_u32 s5, s4, 0xb0014 +; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s10, s10, 0xffe +; GFX11-SDAG-FAKE16-NEXT: s_sub_i32 s9, 0x3f1, s5 +; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-SDAG-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v1, s9, 0, 13 +; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s2, s2, 0x8000 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s2, s2, s3 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s11, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s9, v0 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s9, s10, s9 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s10, s9, 0x1000 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s12, s10, s11 +; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s11, s12, s11 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s11, s10 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, 1, 0 +; GFX11-SDAG-FAKE16-NEXT: s_addk_i32 s5, 0xfc10 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s3, s12, s3 +; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s10, s5, 12 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s10, s9, s10 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s5, 1 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s3, s10 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s10, s3, 7 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_gt_i32 s10, 5 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s11, 1, 0 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_eq_u32 s10, 3 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s10, 1, 0 +; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s3, s3, 2 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s10, s10, s11 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: s_add_i32 s3, s3, s10 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s5, 31 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s3, 0x7c00 +; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s9, 0 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s8, s8, 0x7c00 +; GFX11-SDAG-FAKE16-NEXT: s_cmpk_eq_i32 s5, 0x40f +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s8, s3 +; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s4, s4, 16 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s4, s4, 0x8000 +; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s3, s4, s3 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s3, s2 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-SDAG-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-SDAG-FAKE16-NEXT: s_endpgm ; -; GFX11-GISEL-TRUE16-LABEL: fptrunc_v2f64_to_v2f16: +; GFX11-GISEL-TRUE16-LABEL: fptrunc_v2f64_to_v2f16_afn: ; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry ; GFX11-GISEL-TRUE16-NEXT: s_load_b128 
s[0:3], s[4:5], 0x24 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -842,7 +3751,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX11-GISEL-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-GISEL-TRUE16-NEXT: s_endpgm ; -; GFX11-GISEL-FAKE16-LABEL: fptrunc_v2f64_to_v2f16: +; GFX11-GISEL-FAKE16-LABEL: fptrunc_v2f64_to_v2f16_afn: ; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry ; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -863,7 +3772,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ptr addrspace(1) %a) { entry: %a.val = load <2 x double>, ptr addrspace(1) %a - %r.val = fptrunc <2 x double> %a.val to <2 x half> + %r.val = fptrunc afn <2 x double> %a.val to <2 x half> store <2 x half> %r.val, ptr addrspace(1) %r ret void } diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll index 2bd3659..4f8eab1 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll @@ -3,17 +3,15 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI-SDAG,VI-SAFE-SDAG %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=VI-GISEL,VI-SAFE-GISEL %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI-SDAG,VI-UNSAFE-SDAG %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI-GISEL,VI-UNSAFE-GISEL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10-SDAG,GFX10-SAFE-SDAG %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX10-GISEL,GFX10-SAFE-GISEL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=0 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX10-SDAG,GFX10-UNSAFE-SDAG %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=1 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX10-GISEL,GFX10-UNSAFE-GISEL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-SAFE-SDAG %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-SAFE-GISEL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-FAKE16 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-TRUE16 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 
-mattr=-flat-for-global,-real-true16 < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-FAKE16 %s define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) { ; SI-LABEL: fptrunc_f64_to_f32: @@ -94,6 +92,85 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) ret void } +define amdgpu_kernel void @fptrunc_f64_to_f32_afn(ptr addrspace(1) %out, double %in) { +; SI-LABEL: fptrunc_f64_to_f32_afn: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-SDAG-LABEL: fptrunc_f64_to_f32_afn: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; VI-SDAG-NEXT: s_mov_b32 s6, -1 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; VI-SDAG-NEXT: s_mov_b32 s4, s0 +; VI-SDAG-NEXT: s_mov_b32 s5, s1 +; VI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: fptrunc_f64_to_f32_afn: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; VI-GISEL-NEXT: s_mov_b32 s2, -1 +; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-GISEL-NEXT: s_endpgm +; +; GFX10-SDAG-LABEL: fptrunc_f64_to_f32_afn: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX10-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: fptrunc_f64_to_f32_afn: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; GFX10-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: fptrunc_f64_to_f32_afn: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: fptrunc_f64_to_f32_afn: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-GISEL-NEXT: s_endpgm + %result = fptrunc afn double %in to float + store float %result, ptr addrspace(1) %out + ret void +} + define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) { ; SI-LABEL: fptrunc_f64_to_f16: ; SI: ; %bb.0: @@ -203,56 +280,56 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; VI-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-SAFE-SDAG-NEXT: 
s_endpgm ; -; VI-SAFE-GISEL-LABEL: fptrunc_f64_to_f16: -; VI-SAFE-GISEL: ; %bb.0: -; VI-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 -; VI-SAFE-GISEL-NEXT: s_lshr_b32 s5, s3, 8 -; VI-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff -; VI-SAFE-GISEL-NEXT: s_addk_i32 s4, 0xfc10 -; VI-SAFE-GISEL-NEXT: s_and_b32 s5, s5, 0xffe -; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s6, s2 -; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0 -; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s5, s2 -; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; VI-SAFE-GISEL-NEXT: s_cselect_b32 s5, 1, 0 -; VI-SAFE-GISEL-NEXT: s_sub_i32 s7, 1, s4 -; VI-SAFE-GISEL-NEXT: s_lshl_b32 s6, s4, 12 -; VI-SAFE-GISEL-NEXT: s_max_i32 s7, s7, 0 -; VI-SAFE-GISEL-NEXT: s_or_b32 s6, s2, s6 -; VI-SAFE-GISEL-NEXT: s_min_i32 s7, s7, 13 -; VI-SAFE-GISEL-NEXT: s_bitset1_b32 s2, 12 -; VI-SAFE-GISEL-NEXT: s_lshl_b32 s5, s5, 9 -; VI-SAFE-GISEL-NEXT: s_lshr_b32 s8, s2, s7 -; VI-SAFE-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00 -; VI-SAFE-GISEL-NEXT: s_lshl_b32 s7, s8, s7 -; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s7, s2 -; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0 -; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s8, s2 -; VI-SAFE-GISEL-NEXT: s_cmp_lt_i32 s4, 1 -; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, s2, s6 -; VI-SAFE-GISEL-NEXT: s_and_b32 s6, s2, 7 -; VI-SAFE-GISEL-NEXT: s_lshr_b32 s2, s2, 2 -; VI-SAFE-GISEL-NEXT: s_cmp_eq_u32 s6, 3 -; VI-SAFE-GISEL-NEXT: s_cselect_b32 s7, 1, 0 -; VI-SAFE-GISEL-NEXT: s_cmp_gt_i32 s6, 5 -; VI-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0 -; VI-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6 -; VI-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6 -; VI-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30 -; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2 -; VI-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f -; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, s5, s2 -; VI-SAFE-GISEL-NEXT: s_lshr_b32 s3, s3, 16 -; VI-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 0x8000 -; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s3, s2 -; VI-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-SAFE-GISEL-NEXT: s_mov_b32 s2, -1 -; VI-SAFE-GISEL-NEXT: s_mov_b32 s3, 0xf000 -; VI-SAFE-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 -; VI-SAFE-GISEL-NEXT: s_endpgm +; VI-GISEL-LABEL: fptrunc_f64_to_f16: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 +; VI-GISEL-NEXT: s_lshr_b32 s5, s3, 8 +; VI-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff +; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10 +; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe +; VI-GISEL-NEXT: s_or_b32 s2, s6, s2 +; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 +; VI-GISEL-NEXT: s_or_b32 s2, s5, s2 +; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0 +; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4 +; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12 +; VI-GISEL-NEXT: s_max_i32 s7, s7, 0 +; VI-GISEL-NEXT: s_or_b32 s6, s2, s6 +; VI-GISEL-NEXT: s_min_i32 s7, s7, 13 +; VI-GISEL-NEXT: s_bitset1_b32 s2, 12 +; VI-GISEL-NEXT: s_lshl_b32 s5, s5, 9 +; VI-GISEL-NEXT: s_lshr_b32 s8, s2, s7 +; VI-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00 +; VI-GISEL-NEXT: s_lshl_b32 s7, s8, s7 +; VI-GISEL-NEXT: s_cmp_lg_u32 s7, s2 +; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0 +; VI-GISEL-NEXT: s_or_b32 s2, s8, s2 +; VI-GISEL-NEXT: s_cmp_lt_i32 s4, 1 +; VI-GISEL-NEXT: s_cselect_b32 s2, s2, s6 +; VI-GISEL-NEXT: s_and_b32 s6, s2, 7 +; VI-GISEL-NEXT: s_lshr_b32 s2, s2, 2 +; VI-GISEL-NEXT: s_cmp_eq_u32 s6, 3 +; 
VI-GISEL-NEXT: s_cselect_b32 s7, 1, 0 +; VI-GISEL-NEXT: s_cmp_gt_i32 s6, 5 +; VI-GISEL-NEXT: s_cselect_b32 s6, 1, 0 +; VI-GISEL-NEXT: s_or_b32 s6, s7, s6 +; VI-GISEL-NEXT: s_add_i32 s2, s2, s6 +; VI-GISEL-NEXT: s_cmp_gt_i32 s4, 30 +; VI-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2 +; VI-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f +; VI-GISEL-NEXT: s_cselect_b32 s2, s5, s2 +; VI-GISEL-NEXT: s_lshr_b32 s3, s3, 16 +; VI-GISEL-NEXT: s_and_b32 s3, s3, 0x8000 +; VI-GISEL-NEXT: s_or_b32 s2, s3, s2 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; VI-GISEL-NEXT: s_mov_b32 s2, -1 +; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-GISEL-NEXT: s_endpgm ; ; VI-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16: ; VI-UNSAFE-SDAG: ; %bb.0: @@ -265,17 +342,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; VI-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-UNSAFE-SDAG-NEXT: s_endpgm ; -; VI-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16: -; VI-UNSAFE-GISEL: ; %bb.0: -; VI-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] -; VI-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1 -; VI-UNSAFE-GISEL-NEXT: s_mov_b32 s3, 0xf000 -; VI-UNSAFE-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-UNSAFE-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 -; VI-UNSAFE-GISEL-NEXT: s_endpgm -; ; GFX10-SAFE-SDAG-LABEL: fptrunc_f64_to_f16: ; GFX10-SAFE-SDAG: ; %bb.0: ; GFX10-SAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 @@ -328,56 +394,56 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX10-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX10-SAFE-SDAG-NEXT: s_endpgm ; -; GFX10-SAFE-GISEL-LABEL: fptrunc_f64_to_f16: -; GFX10-SAFE-GISEL: ; %bb.0: -; GFX10-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff -; GFX10-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 -; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s5, s3, 8 -; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s6, s2 -; GFX10-SAFE-GISEL-NEXT: s_addk_i32 s4, 0xfc10 -; GFX10-SAFE-GISEL-NEXT: s_and_b32 s5, s5, 0xffe -; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0 -; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s5, s2 -; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s5, 1, 0 -; GFX10-SAFE-GISEL-NEXT: s_sub_i32 s6, 1, s4 -; GFX10-SAFE-GISEL-NEXT: s_or_b32 s8, s2, 0x1000 -; GFX10-SAFE-GISEL-NEXT: s_max_i32 s6, s6, 0 -; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s7, s4, 12 -; GFX10-SAFE-GISEL-NEXT: s_min_i32 s6, s6, 13 -; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s5, s5, 9 -; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s9, s8, s6 -; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s7 -; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s6, s9, s6 -; GFX10-SAFE-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00 -; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s6, s8 -; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-SAFE-GISEL-NEXT: s_or_b32 s6, s9, s6 -; GFX10-SAFE-GISEL-NEXT: s_cmp_lt_i32 s4, 1 -; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, s6, s2 -; GFX10-SAFE-GISEL-NEXT: s_and_b32 s6, s2, 7 -; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s2, s2, 2 -; GFX10-SAFE-GISEL-NEXT: s_cmp_eq_u32 s6, 3 -; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s7, 1, 0 -; GFX10-SAFE-GISEL-NEXT: s_cmp_gt_i32 s6, 5 -; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6 -; GFX10-SAFE-GISEL-NEXT: s_add_i32 s2, s2, 
s6 -; GFX10-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30 -; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2 -; GFX10-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f -; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, s5, s2 -; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX10-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s3, s2 -; GFX10-SAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-SAFE-GISEL-NEXT: s_mov_b32 s2, -1 -; GFX10-SAFE-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX10-SAFE-GISEL-NEXT: s_endpgm +; GFX10-GISEL-LABEL: fptrunc_f64_to_f16: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff +; GFX10-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 +; GFX10-GISEL-NEXT: s_lshr_b32 s5, s3, 8 +; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2 +; GFX10-GISEL-NEXT: s_addk_i32 s4, 0xfc10 +; GFX10-GISEL-NEXT: s_and_b32 s5, s5, 0xffe +; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0 +; GFX10-GISEL-NEXT: s_or_b32 s2, s5, s2 +; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX10-GISEL-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-GISEL-NEXT: s_sub_i32 s6, 1, s4 +; GFX10-GISEL-NEXT: s_or_b32 s8, s2, 0x1000 +; GFX10-GISEL-NEXT: s_max_i32 s6, s6, 0 +; GFX10-GISEL-NEXT: s_lshl_b32 s7, s4, 12 +; GFX10-GISEL-NEXT: s_min_i32 s6, s6, 13 +; GFX10-GISEL-NEXT: s_lshl_b32 s5, s5, 9 +; GFX10-GISEL-NEXT: s_lshr_b32 s9, s8, s6 +; GFX10-GISEL-NEXT: s_or_b32 s2, s2, s7 +; GFX10-GISEL-NEXT: s_lshl_b32 s6, s9, s6 +; GFX10-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00 +; GFX10-GISEL-NEXT: s_cmp_lg_u32 s6, s8 +; GFX10-GISEL-NEXT: s_cselect_b32 s6, 1, 0 +; GFX10-GISEL-NEXT: s_or_b32 s6, s9, s6 +; GFX10-GISEL-NEXT: s_cmp_lt_i32 s4, 1 +; GFX10-GISEL-NEXT: s_cselect_b32 s2, s6, s2 +; GFX10-GISEL-NEXT: s_and_b32 s6, s2, 7 +; GFX10-GISEL-NEXT: s_lshr_b32 s2, s2, 2 +; GFX10-GISEL-NEXT: s_cmp_eq_u32 s6, 3 +; GFX10-GISEL-NEXT: s_cselect_b32 s7, 1, 0 +; GFX10-GISEL-NEXT: s_cmp_gt_i32 s6, 5 +; GFX10-GISEL-NEXT: s_cselect_b32 s6, 1, 0 +; GFX10-GISEL-NEXT: s_or_b32 s6, s7, s6 +; GFX10-GISEL-NEXT: s_add_i32 s2, s2, s6 +; GFX10-GISEL-NEXT: s_cmp_gt_i32 s4, 30 +; GFX10-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2 +; GFX10-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f +; GFX10-GISEL-NEXT: s_cselect_b32 s2, s5, s2 +; GFX10-GISEL-NEXT: s_lshr_b32 s3, s3, 16 +; GFX10-GISEL-NEXT: s_and_b32 s3, s3, 0x8000 +; GFX10-GISEL-NEXT: s_or_b32 s2, s3, s2 +; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX10-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX10-GISEL-NEXT: s_endpgm ; ; GFX10-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16: ; GFX10-UNSAFE-SDAG: ; %bb.0: @@ -390,17 +456,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX10-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX10-UNSAFE-SDAG-NEXT: s_endpgm ; -; GFX10-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16: -; GFX10-UNSAFE-GISEL: ; %bb.0: -; GFX10-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] -; GFX10-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1 -; GFX10-UNSAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-UNSAFE-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX10-UNSAFE-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX10-UNSAFE-GISEL-NEXT: s_endpgm -; ; GFX11-SAFE-SDAG-LABEL: 
fptrunc_f64_to_f16: ; GFX11-SAFE-SDAG: ; %bb.0: ; GFX11-SAFE-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 @@ -461,62 +516,368 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX11-SAFE-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-SAFE-SDAG-NEXT: s_endpgm ; -; GFX11-SAFE-GISEL-LABEL: fptrunc_f64_to_f16: +; GFX11-GISEL-LABEL: fptrunc_f64_to_f16: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff +; GFX11-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 +; GFX11-GISEL-NEXT: s_lshr_b32 s5, s3, 8 +; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2 +; GFX11-GISEL-NEXT: s_addk_i32 s4, 0xfc10 +; GFX11-GISEL-NEXT: s_and_b32 s5, s5, 0xffe +; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-GISEL-NEXT: s_cselect_b32 s2, 1, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_or_b32 s2, s5, s2 +; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-GISEL-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-GISEL-NEXT: s_sub_i32 s6, 1, s4 +; GFX11-GISEL-NEXT: s_or_b32 s8, s2, 0x1000 +; GFX11-GISEL-NEXT: s_max_i32 s6, s6, 0 +; GFX11-GISEL-NEXT: s_lshl_b32 s7, s4, 12 +; GFX11-GISEL-NEXT: s_min_i32 s6, s6, 13 +; GFX11-GISEL-NEXT: s_lshl_b32 s5, s5, 9 +; GFX11-GISEL-NEXT: s_lshr_b32 s9, s8, s6 +; GFX11-GISEL-NEXT: s_or_b32 s2, s2, s7 +; GFX11-GISEL-NEXT: s_lshl_b32 s6, s9, s6 +; GFX11-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00 +; GFX11-GISEL-NEXT: s_cmp_lg_u32 s6, s8 +; GFX11-GISEL-NEXT: s_cselect_b32 s6, 1, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_or_b32 s6, s9, s6 +; GFX11-GISEL-NEXT: s_cmp_lt_i32 s4, 1 +; GFX11-GISEL-NEXT: s_cselect_b32 s2, s6, s2 +; GFX11-GISEL-NEXT: s_and_b32 s6, s2, 7 +; GFX11-GISEL-NEXT: s_lshr_b32 s2, s2, 2 +; GFX11-GISEL-NEXT: s_cmp_eq_u32 s6, 3 +; GFX11-GISEL-NEXT: s_cselect_b32 s7, 1, 0 +; GFX11-GISEL-NEXT: s_cmp_gt_i32 s6, 5 +; GFX11-GISEL-NEXT: s_cselect_b32 s6, 1, 0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_or_b32 s6, s7, s6 +; GFX11-GISEL-NEXT: s_add_i32 s2, s2, s6 +; GFX11-GISEL-NEXT: s_cmp_gt_i32 s4, 30 +; GFX11-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2 +; GFX11-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f +; GFX11-GISEL-NEXT: s_cselect_b32 s2, s5, s2 +; GFX11-GISEL-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-GISEL-NEXT: s_and_b32 s3, s3, 0x8000 +; GFX11-GISEL-NEXT: s_or_b32 s2, s3, s2 +; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX11-UNSAFE-DAG-TRUE16-LABEL: fptrunc_f64_to_f16: +; GFX11-UNSAFE-DAG-TRUE16: ; %bb.0: +; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-UNSAFE-DAG-TRUE16-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-UNSAFE-DAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-UNSAFE-DAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_endpgm +; +; GFX11-UNSAFE-DAG-FAKE16-LABEL: fptrunc_f64_to_f16: +; 
GFX11-UNSAFE-DAG-FAKE16: ; %bb.0: +; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-UNSAFE-DAG-FAKE16-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-UNSAFE-DAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-UNSAFE-DAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_endpgm + %result = fptrunc double %in to half + %result_i16 = bitcast half %result to i16 + store i16 %result_i16, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptrunc_f64_to_f16_afn(ptr addrspace(1) %out, double %in) { +; SI-LABEL: fptrunc_f64_to_f16_afn: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_movk_i32 s2, 0x7e00 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s0, s7, 8 +; SI-NEXT: s_and_b32 s1, s7, 0x1ff +; SI-NEXT: s_and_b32 s8, s0, 0xffe +; SI-NEXT: s_or_b32 s0, s1, s6 +; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014 +; SI-NEXT: v_readfirstlane_b32 s1, v0 +; SI-NEXT: s_sub_i32 s6, 0x3f1, s0 +; SI-NEXT: s_or_b32 s1, s8, s1 +; SI-NEXT: v_med3_i32 v0, s6, 0, 13 +; SI-NEXT: s_or_b32 s6, s1, 0x1000 +; SI-NEXT: v_readfirstlane_b32 s8, v0 +; SI-NEXT: s_lshr_b32 s9, s6, s8 +; SI-NEXT: s_lshl_b32 s8, s9, s8 +; SI-NEXT: s_cmp_lg_u32 s8, s6 +; SI-NEXT: s_cselect_b32 s6, 1, 0 +; SI-NEXT: s_addk_i32 s0, 0xfc10 +; SI-NEXT: s_or_b32 s6, s9, s6 +; SI-NEXT: s_lshl_b32 s8, s0, 12 +; SI-NEXT: s_or_b32 s8, s1, s8 +; SI-NEXT: s_cmp_lt_i32 s0, 1 +; SI-NEXT: s_cselect_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s8, s6, 7 +; SI-NEXT: s_cmp_gt_i32 s8, 5 +; SI-NEXT: s_cselect_b32 s9, 1, 0 +; SI-NEXT: s_cmp_eq_u32 s8, 3 +; SI-NEXT: s_cselect_b32 s8, 1, 0 +; SI-NEXT: s_lshr_b32 s6, s6, 2 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_add_i32 s6, s6, s8 +; SI-NEXT: s_cmp_lt_i32 s0, 31 +; SI-NEXT: s_cselect_b32 s6, s6, 0x7c00 +; SI-NEXT: s_cmp_lg_u32 s1, 0 +; SI-NEXT: s_cselect_b32 s1, s2, 0x7c00 +; SI-NEXT: s_cmpk_eq_i32 s0, 0x40f +; SI-NEXT: s_cselect_b32 s0, s1, s6 +; SI-NEXT: s_lshr_b32 s1, s7, 16 +; SI-NEXT: s_and_b32 s1, s1, 0x8000 +; SI-NEXT: s_or_b32 s6, s1, s0 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-SAFE-SDAG-LABEL: fptrunc_f64_to_f16_afn: +; VI-SAFE-SDAG: ; %bb.0: +; VI-SAFE-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 +; VI-SAFE-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; VI-SAFE-SDAG-NEXT: s_mov_b32 s2, -1 +; VI-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SAFE-SDAG-NEXT: s_mov_b32 s0, s4 +; VI-SAFE-SDAG-NEXT: s_lshr_b32 s4, s7, 8 +; VI-SAFE-SDAG-NEXT: s_and_b32 s8, s4, 0xffe +; VI-SAFE-SDAG-NEXT: s_and_b32 s4, s7, 0x1ff +; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s6 +; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0 +; VI-SAFE-SDAG-NEXT: s_mov_b32 s1, s5 +; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; VI-SAFE-SDAG-NEXT: s_bfe_u32 s6, s7, 0xb0014 +; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s8, s4 +; VI-SAFE-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s6 +; VI-SAFE-SDAG-NEXT: v_med3_i32 v0, s8, 0, 13 +; VI-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000 
+; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s8, v0 +; VI-SAFE-SDAG-NEXT: s_lshr_b32 s9, s5, s8 +; VI-SAFE-SDAG-NEXT: s_lshl_b32 s8, s9, s8 +; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s8, s5 +; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0 +; VI-SAFE-SDAG-NEXT: s_addk_i32 s6, 0xfc10 +; VI-SAFE-SDAG-NEXT: s_lshl_b32 s8, s6, 12 +; VI-SAFE-SDAG-NEXT: s_or_b32 s5, s9, s5 +; VI-SAFE-SDAG-NEXT: s_or_b32 s8, s4, s8 +; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s6, 1 +; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s8 +; VI-SAFE-SDAG-NEXT: s_and_b32 s8, s5, 7 +; VI-SAFE-SDAG-NEXT: s_cmp_gt_i32 s8, 5 +; VI-SAFE-SDAG-NEXT: s_cselect_b32 s9, 1, 0 +; VI-SAFE-SDAG-NEXT: s_cmp_eq_u32 s8, 3 +; VI-SAFE-SDAG-NEXT: s_cselect_b32 s8, 1, 0 +; VI-SAFE-SDAG-NEXT: s_or_b32 s8, s8, s9 +; VI-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2 +; VI-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s8 +; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s6, 31 +; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0 +; VI-SAFE-SDAG-NEXT: s_movk_i32 s4, 0x7e00 +; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00 +; VI-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s6, 0x40f +; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, s4, s5 +; VI-SAFE-SDAG-NEXT: s_lshr_b32 s5, s7, 16 +; VI-SAFE-SDAG-NEXT: s_and_b32 s5, s5, 0x8000 +; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s5, s4 +; VI-SAFE-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; VI-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-SAFE-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: fptrunc_f64_to_f16_afn: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; VI-GISEL-NEXT: s_mov_b32 s2, -1 +; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-GISEL-NEXT: s_endpgm +; +; VI-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16_afn: +; VI-UNSAFE-SDAG: ; %bb.0: +; VI-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; VI-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; VI-UNSAFE-SDAG-NEXT: s_mov_b32 s2, -1 +; VI-UNSAFE-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-UNSAFE-SDAG-NEXT: s_endpgm +; +; GFX10-SAFE-SDAG-LABEL: fptrunc_f64_to_f16_afn: +; GFX10-SAFE-SDAG: ; %bb.0: +; GFX10-SAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SAFE-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff +; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s5, s3, 8 +; GFX10-SAFE-SDAG-NEXT: s_or_b32 s2, s4, s2 +; GFX10-SAFE-SDAG-NEXT: s_and_b32 s4, s5, 0xffe +; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s2, 0 +; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s2, -1, 0 +; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX10-SAFE-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014 +; GFX10-SAFE-SDAG-NEXT: s_sub_i32 s5, 0x3f1, s2 +; GFX10-SAFE-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13 +; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v1 +; GFX10-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s5 +; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000 +; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s7, s5, s6 +; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s6, s7, s6 +; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, s5 +; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10 +; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s7, s5 +; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s6, s2, 12 +; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s4, s6 +; 
GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1 +; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s6 +; GFX10-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7 +; GFX10-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5 +; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s7, 1, 0 +; GFX10-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3 +; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s6, 1, 0 +; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2 +; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7 +; GFX10-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s6 +; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31 +; GFX10-SAFE-SDAG-NEXT: s_movk_i32 s6, 0x7e00 +; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0 +; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s4, s6, 0x7c00 +; GFX10-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f +; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s2, s4, s5 +; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s3, s3, 16 +; GFX10-SAFE-SDAG-NEXT: s_and_b32 s3, s3, 0x8000 +; GFX10-SAFE-SDAG-NEXT: s_or_b32 s2, s3, s2 +; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-SAFE-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX10-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX10-SAFE-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: fptrunc_f64_to_f16_afn: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; GFX10-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX10-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16_afn: +; GFX10-UNSAFE-SDAG: ; %bb.0: +; GFX10-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; GFX10-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-UNSAFE-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX10-UNSAFE-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX10-UNSAFE-SDAG-NEXT: s_endpgm +; +; GFX11-SAFE-SDAG-LABEL: fptrunc_f64_to_f16_afn: +; GFX11-SAFE-SDAG: ; %bb.0: +; GFX11-SAFE-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SAFE-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff +; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s5, s3, 8 +; GFX11-SAFE-SDAG-NEXT: s_or_b32 s2, s4, s2 +; GFX11-SAFE-SDAG-NEXT: s_and_b32 s4, s5, 0xffe +; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-SAFE-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014 +; GFX11-SAFE-SDAG-NEXT: s_sub_i32 s5, 0x3f1, s2 +; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13 +; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v1 +; GFX11-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s5 +; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000 +; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s7, s5, s6 +; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s6, s7, s6 +; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, s5 +; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10 +; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s7, s5 +; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s6, s2, 12 +; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s4, s6 +; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1 +; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s6 +; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7 +; GFX11-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5 +; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s7, 1, 0 +; GFX11-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3 +; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s6, 1, 0 +; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2 +; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7 +; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s6 +; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31 +; GFX11-SAFE-SDAG-NEXT: s_movk_i32 s6, 0x7e00 +; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s4, s6, 0x7c00 +; GFX11-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f +; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s2, s4, s5 +; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SAFE-SDAG-NEXT: s_and_b32 s3, s3, 0x8000 +; GFX11-SAFE-SDAG-NEXT: s_or_b32 s2, s3, s2 +; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-SAFE-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX11-SAFE-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-SAFE-SDAG-NEXT: s_endpgm +; +; GFX11-SAFE-GISEL-LABEL: fptrunc_f64_to_f16_afn: ; GFX11-SAFE-GISEL: ; %bb.0: ; GFX11-SAFE-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff -; GFX11-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014 -; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s5, s3, 8 -; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s6, s2 -; GFX11-SAFE-GISEL-NEXT: s_addk_i32 s4, 0xfc10 -; GFX11-SAFE-GISEL-NEXT: s_and_b32 s5, s5, 0xffe -; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0 -; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s5, s2 -; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s5, 1, 0 -; GFX11-SAFE-GISEL-NEXT: s_sub_i32 s6, 1, s4 -; GFX11-SAFE-GISEL-NEXT: s_or_b32 s8, s2, 0x1000 -; GFX11-SAFE-GISEL-NEXT: s_max_i32 s6, s6, 0 -; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s7, s4, 12 -; GFX11-SAFE-GISEL-NEXT: s_min_i32 s6, s6, 13 -; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s5, s5, 9 -; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s9, s8, s6 -; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s7 -; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s6, s9, s6 -; GFX11-SAFE-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00 -; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s6, s8 -; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0 -; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-SAFE-GISEL-NEXT: s_or_b32 s6, s9, s6 -; GFX11-SAFE-GISEL-NEXT: s_cmp_lt_i32 s4, 1 -; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, s6, s2 -; GFX11-SAFE-GISEL-NEXT: s_and_b32 s6, s2, 7 -; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s2, s2, 2 -; GFX11-SAFE-GISEL-NEXT: s_cmp_eq_u32 s6, 3 -; GFX11-SAFE-GISEL-NEXT: 
s_cselect_b32 s7, 1, 0 -; GFX11-SAFE-GISEL-NEXT: s_cmp_gt_i32 s6, 5 -; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0 -; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6 -; GFX11-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6 -; GFX11-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30 -; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2 -; GFX11-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f -; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, s5, s2 -; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s3, s2 -; GFX11-SAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX11-SAFE-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX11-SAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-GISEL-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX11-SAFE-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-SAFE-GISEL-NEXT: s_endpgm ; -; GFX11-UNSAFE-DAG-TRUE16-LABEL: fptrunc_f64_to_f16: +; GFX11-UNSAFE-DAG-TRUE16-LABEL: fptrunc_f64_to_f16_afn: ; GFX11-UNSAFE-DAG-TRUE16: ; %bb.0: ; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -528,7 +889,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX11-UNSAFE-DAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_endpgm ; -; GFX11-UNSAFE-DAG-FAKE16-LABEL: fptrunc_f64_to_f16: +; GFX11-UNSAFE-DAG-FAKE16-LABEL: fptrunc_f64_to_f16_afn: ; GFX11-UNSAFE-DAG-FAKE16: ; %bb.0: ; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -540,7 +901,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX11-UNSAFE-DAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_endpgm ; -; GFX11-UNSAFE-GISEL-TRUE16-LABEL: fptrunc_f64_to_f16: +; GFX11-UNSAFE-GISEL-TRUE16-LABEL: fptrunc_f64_to_f16_afn: ; GFX11-UNSAFE-GISEL-TRUE16: ; %bb.0: ; GFX11-UNSAFE-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-UNSAFE-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) @@ -552,7 +913,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX11-UNSAFE-GISEL-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-UNSAFE-GISEL-TRUE16-NEXT: s_endpgm ; -; GFX11-UNSAFE-GISEL-FAKE16-LABEL: fptrunc_f64_to_f16: +; GFX11-UNSAFE-GISEL-FAKE16-LABEL: fptrunc_f64_to_f16_afn: ; GFX11-UNSAFE-GISEL-FAKE16: ; %bb.0: ; GFX11-UNSAFE-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-UNSAFE-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) @@ -563,7 +924,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX11-UNSAFE-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-UNSAFE-GISEL-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-UNSAFE-GISEL-FAKE16-NEXT: s_endpgm - %result = fptrunc double %in to half + %result = fptrunc afn double %in to half %result_i16 = bitcast half %result to i16 store i16 %result_i16, ptr addrspace(1) %out ret void @@ -662,6 +1023,99 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do ret void } +define amdgpu_kernel void 
@fptrunc_v2f64_to_v2f32_afn(ptr addrspace(1) %out, <2 x double> %in) { +; SI-LABEL: fptrunc_v2f64_to_v2f32_afn: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f64_e32 v1, s[2:3] +; SI-NEXT: v_cvt_f32_f64_e32 v0, s[0:1] +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f32_afn: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; VI-SDAG-NEXT: s_mov_b32 s6, -1 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1] +; VI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f32_afn: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; VI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-GISEL-NEXT: s_mov_b32 s6, -1 +; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[2:3] +; VI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-GISEL-NEXT: s_endpgm +; +; GFX10-SDAG-LABEL: fptrunc_v2f64_to_v2f32_afn: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1] +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: fptrunc_v2f64_to_v2f32_afn: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[2:3] +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: fptrunc_v2f64_to_v2f32_afn: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1] +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: fptrunc_v2f64_to_v2f32_afn: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[2:3] +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: buffer_store_b64 v[0:1], off, 
s[0:3], 0 +; GFX11-GISEL-NEXT: s_endpgm + %result = fptrunc afn <2 x double> %in to <2 x float> + store <2 x float> %result, ptr addrspace(1) %out + ret void +} + define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x double> %in) { ; SI-LABEL: fptrunc_v3f64_to_v3f32: ; SI: ; %bb.0: @@ -769,6 +1223,113 @@ define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x do ret void } +define amdgpu_kernel void @fptrunc_v3f64_to_v3f32_afn(ptr addrspace(1) %out, <3 x double> %in) { +; SI-LABEL: fptrunc_v3f64_to_v3f32_afn: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x11 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x15 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; SI-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; SI-NEXT: v_cvt_f32_f64_e32 v2, s[4:5] +; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-SDAG-LABEL: fptrunc_v3f64_to_v3f32_afn: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x54 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44 +; VI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[6:7] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1] +; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; VI-SDAG-NEXT: s_mov_b32 s6, -1 +; VI-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: fptrunc_v3f64_to_v3f32_afn: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-GISEL-NEXT: s_mov_b32 s2, -1 +; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; VI-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; VI-GISEL-NEXT: s_endpgm +; +; GFX10-SDAG-LABEL: fptrunc_v3f64_to_v3f32_afn: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x54 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x44 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[6:7] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1] +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: fptrunc_v3f64_to_v3f32_afn: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; GFX10-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: fptrunc_v3f64_to_v3f32_afn: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: 
s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x54 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x44 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[6:7] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[2:3] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[0:1] +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], 0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: fptrunc_v3f64_to_v3f32_afn: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b256 s[8:15], s[4:5], 0x44 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; GFX11-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], 0 +; GFX11-GISEL-NEXT: s_endpgm + %result = fptrunc afn <3 x double> %in to <3 x float> + store <3 x float> %result, ptr addrspace(1) %out + ret void +} + define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x double> %in) { ; SI-LABEL: fptrunc_v4f64_to_v4f32: ; SI: ; %bb.0: @@ -876,6 +1437,113 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do ret void } +define amdgpu_kernel void @fptrunc_v4f64_to_v4f32_afn(ptr addrspace(1) %out, <4 x double> %in) { +; SI-LABEL: fptrunc_v4f64_to_v4f32_afn: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f64_e32 v3, s[14:15] +; SI-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; SI-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; SI-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-SDAG-LABEL: fptrunc_v4f64_to_v4f32_afn: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; VI-SDAG-NEXT: s_mov_b32 s2, -1 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; VI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: fptrunc_v4f64_to_v4f32_afn: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-GISEL-NEXT: s_mov_b32 s2, -1 +; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15] +; VI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-GISEL-NEXT: s_endpgm +; +; GFX10-SDAG-LABEL: fptrunc_v4f64_to_v4f32_afn: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000 +; 
GFX10-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: fptrunc_v4f64_to_v4f32_afn: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15] +; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: fptrunc_v4f64_to_v4f32_afn: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b256 s[8:15], s[4:5], 0x44 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; GFX11-SDAG-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: fptrunc_v4f64_to_v4f32_afn: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b256 s[8:15], s[4:5], 0x44 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15] +; GFX11-GISEL-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-GISEL-NEXT: s_endpgm + %result = fptrunc afn <4 x double> %in to <4 x float> + store <4 x float> %result, ptr addrspace(1) %out + ret void +} + define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x double> %in) { ; SI-LABEL: fptrunc_v8f64_to_v8f32: ; SI: ; %bb.0: @@ -1019,3 +1687,150 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do store <8 x float> %result, ptr addrspace(1) %out ret void } + +define amdgpu_kernel void @fptrunc_v8f64_to_v8f32_afn(ptr addrspace(1) %out, <8 x double> %in) { +; SI-LABEL: fptrunc_v8f64_to_v8f32_afn: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f64_e32 v3, s[14:15] +; SI-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; SI-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; SI-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; SI-NEXT: v_cvt_f32_f64_e32 v7, s[22:23] +; SI-NEXT: v_cvt_f32_f64_e32 v6, s[20:21] +; SI-NEXT: v_cvt_f32_f64_e32 v5, s[18:19] +; SI-NEXT: v_cvt_f32_f64_e32 v4, s[16:17] +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: 
s_endpgm +; +; VI-SDAG-LABEL: fptrunc_v8f64_to_v8f32_afn: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; VI-SDAG-NEXT: s_mov_b32 s2, -1 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v7, s[22:23] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v6, s[20:21] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v5, s[18:19] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v4, s[16:17] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; VI-SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; VI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-SDAG-NEXT: s_endpgm +; +; VI-GISEL-LABEL: fptrunc_v8f64_to_v8f32_afn: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-GISEL-NEXT: s_mov_b32 s2, -1 +; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000 +; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v4, s[16:17] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v5, s[18:19] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v6, s[20:21] +; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v7, s[22:23] +; VI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; VI-GISEL-NEXT: s_endpgm +; +; GFX10-SDAG-LABEL: fptrunc_v8f64_to_v8f32_afn: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v7, s[22:23] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v6, s[20:21] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v5, s[18:19] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v4, s[16:17] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: fptrunc_v8f64_to_v8f32_afn: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v4, s[16:17] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v5, s[18:19] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v6, s[20:21] +; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v7, s[22:23] +; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX10-GISEL-NEXT: s_endpgm +; +; 
GFX11-SDAG-LABEL: fptrunc_v8f64_to_v8f32_afn: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0x64 +; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v7, s[22:23] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v6, s[20:21] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v5, s[18:19] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v4, s[16:17] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[14:15] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16 +; GFX11-SDAG-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: fptrunc_v8f64_to_v8f32_afn: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0x64 +; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[8:9] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[10:11] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[12:13] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[14:15] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v4, s[16:17] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v5, s[18:19] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v6, s[20:21] +; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v7, s[22:23] +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-GISEL-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16 +; GFX11-GISEL-NEXT: s_endpgm + %result = fptrunc afn <8 x double> %in to <8 x float> + store <8 x float> %result, ptr addrspace(1) %out + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX10-SAFE-GISEL: {{.*}} +; VI-SAFE-GISEL: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll index 7d85d34..beda16c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll @@ -1,13 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-VGPRCD,GFX942-SDAG,GFX942-VGPRCD-SDAG %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-VGPRCD,GFX942-GISEL,GFX942-VGPRCD-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX942,GFX942-AGPRCD,GFX942-SDAG,GFX942-AGPRCD-SDAG %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX942,GFX942-AGPRCD,GFX942-GISEL,GFX942-AGPRCD-GISEL %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GFX950,GFX950-VGPRCD,GFX950-SDAG,GFX950-VGPRCD-SDAG %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GFX950,GFX950-VGPRCD,GFX950-GISEL,GFX950-VGPRCD-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX950,GFX950-AGPRCD,GFX950-SDAG,GFX950-AGPRCD-SDAG %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx950 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX950,GFX950-AGPRCD,GFX950-GISEL,GFX950-AGPRCD-GISEL %s declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64, i64, <4 x i32>, i32, i32, i32) declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64, i64, <16 x i32>, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll index d358837..8081a15 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll @@ -252,62 +252,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg ; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v44, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GCN-NEXT: v_accvgpr_write_b32 a31, s23 -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GCN-NEXT: v_accvgpr_write_b32 a30, s22 -; GCN-NEXT: v_accvgpr_write_b32 a29, s21 -; GCN-NEXT: v_accvgpr_write_b32 a28, s20 -; GCN-NEXT: v_accvgpr_write_b32 a27, s19 -; GCN-NEXT: v_accvgpr_write_b32 a26, s18 -; GCN-NEXT: v_accvgpr_write_b32 a25, s17 -; GCN-NEXT: v_accvgpr_write_b32 a24, s16 -; GCN-NEXT: v_accvgpr_write_b32 a23, s15 -; GCN-NEXT: v_accvgpr_write_b32 a22, s14 -; GCN-NEXT: v_accvgpr_write_b32 a21, s13 -; GCN-NEXT: v_accvgpr_write_b32 a20, s12 -; GCN-NEXT: v_accvgpr_write_b32 a19, s11 -; GCN-NEXT: v_accvgpr_write_b32 a18, s10 -; GCN-NEXT: v_accvgpr_write_b32 a17, s9 -; GCN-NEXT: v_accvgpr_write_b32 a16, s8 -; GCN-NEXT: v_mov_b32_e32 v10, s20 -; GCN-NEXT: v_mov_b32_e32 v11, s21 -; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31] -; GCN-NEXT: v_mov_b32_e32 v12, s22 -; 
GCN-NEXT: v_mov_b32_e32 v13, s23 -; GCN-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; GCN-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; GCN-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GCN-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GCN-NEXT: v_mov_b32_e32 v40, s20 +; GCN-NEXT: v_mov_b32_e32 v41, s21 +; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31] +; GCN-NEXT: v_mov_b32_e32 v42, s22 +; GCN-NEXT: v_mov_b32_e32 v43, s23 +; GCN-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 2 +; GCN-NEXT: v_mov_b32_e32 v16, s16 +; GCN-NEXT: v_mov_b32_e32 v17, s17 +; GCN-NEXT: v_mov_b32_e32 v18, s18 +; GCN-NEXT: v_mov_b32_e32 v19, s19 +; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NEXT: v_mov_b32_e32 v1, s13 -; GCN-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v16, s12 +; GCN-NEXT: v_mov_b32_e32 v17, s13 +; GCN-NEXT: v_mov_b32_e32 v18, s14 +; GCN-NEXT: v_mov_b32_e32 v19, s15 +; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v16, s8 +; GCN-NEXT: v_mov_b32_e32 v17, s9 +; GCN-NEXT: v_mov_b32_e32 v18, s10 +; GCN-NEXT: v_mov_b32_e32 v19, s11 +; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) @@ -322,62 +315,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa ; GCN-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GCN-NEXT: 
v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v44, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GCN-NEXT: v_accvgpr_write_b32 a31, s23 -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GCN-NEXT: v_accvgpr_write_b32 a30, s22 -; GCN-NEXT: v_accvgpr_write_b32 a29, s21 -; GCN-NEXT: v_accvgpr_write_b32 a28, s20 -; GCN-NEXT: v_accvgpr_write_b32 a27, s19 -; GCN-NEXT: v_accvgpr_write_b32 a26, s18 -; GCN-NEXT: v_accvgpr_write_b32 a25, s17 -; GCN-NEXT: v_accvgpr_write_b32 a24, s16 -; GCN-NEXT: v_accvgpr_write_b32 a23, s15 -; GCN-NEXT: v_accvgpr_write_b32 a22, s14 -; GCN-NEXT: v_accvgpr_write_b32 a21, s13 -; GCN-NEXT: v_accvgpr_write_b32 a20, s12 -; GCN-NEXT: v_accvgpr_write_b32 a19, s11 -; GCN-NEXT: v_accvgpr_write_b32 a18, s10 -; GCN-NEXT: v_accvgpr_write_b32 a17, s9 -; GCN-NEXT: v_accvgpr_write_b32 a16, s8 -; GCN-NEXT: v_mov_b32_e32 v10, s20 -; GCN-NEXT: v_mov_b32_e32 v11, s21 -; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3 -; GCN-NEXT: v_mov_b32_e32 v12, s22 -; GCN-NEXT: v_mov_b32_e32 v13, s23 -; GCN-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; GCN-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; GCN-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; GCN-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GCN-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; GCN-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GCN-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GCN-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GCN-NEXT: v_mov_b32_e32 v40, s20 +; GCN-NEXT: v_mov_b32_e32 v41, s21 +; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 +; GCN-NEXT: v_mov_b32_e32 v42, s22 +; GCN-NEXT: v_mov_b32_e32 v43, s23 +; GCN-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 2 +; GCN-NEXT: v_mov_b32_e32 v16, s16 +; GCN-NEXT: v_mov_b32_e32 v17, s17 +; GCN-NEXT: v_mov_b32_e32 v18, s18 +; GCN-NEXT: v_mov_b32_e32 v19, s19 +; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NEXT: v_mov_b32_e32 v1, s13 -; GCN-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v16, s12 +; GCN-NEXT: v_mov_b32_e32 v17, s13 +; GCN-NEXT: v_mov_b32_e32 v18, s14 +; GCN-NEXT: v_mov_b32_e32 v19, s15 +; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; GCN-NEXT: v_mov_b32_e32 v16, s8 +; GCN-NEXT: v_mov_b32_e32 v17, s9 +; GCN-NEXT: v_mov_b32_e32 v18, s10 +; GCN-NEXT: v_mov_b32_e32 v19, s11 +; GCN-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 
sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 +; GCN-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 1, i32 2, i32 3) @@ -393,35 +379,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat> ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GCN-NEXT: v_accvgpr_write_b32 a0, s8 -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GCN-NEXT: v_accvgpr_write_b32 a1, s9 -; GCN-NEXT: v_accvgpr_write_b32 a2, s10 -; GCN-NEXT: v_accvgpr_write_b32 a3, s11 -; GCN-NEXT: v_accvgpr_write_b32 a4, s12 -; GCN-NEXT: v_accvgpr_write_b32 a5, s13 -; GCN-NEXT: v_accvgpr_write_b32 a6, s14 -; GCN-NEXT: v_accvgpr_write_b32 a7, s15 -; GCN-NEXT: v_accvgpr_write_b32 a8, s16 -; GCN-NEXT: v_accvgpr_write_b32 a9, s17 -; GCN-NEXT: v_accvgpr_write_b32 a10, s18 -; GCN-NEXT: v_accvgpr_write_b32 a11, s19 -; GCN-NEXT: v_accvgpr_write_b32 a12, s20 -; GCN-NEXT: v_accvgpr_write_b32 a13, s21 -; GCN-NEXT: v_accvgpr_write_b32 a14, s22 -; GCN-NEXT: v_accvgpr_write_b32 a15, s23 +; GCN-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; GCN-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; GCN-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GCN-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GCN-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GCN-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GCN-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] -; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] +; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 2 -; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) store <16 x float> %result, ptr addrspace(1) %out @@ -435,40 +413,32 @@ define amdgpu_kernel void 
@test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf ; GCN-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GCN-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GCN-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GCN-NEXT: v_accvgpr_write_b32 a0, s8 -; GCN-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GCN-NEXT: v_accvgpr_write_b32 a1, s9 -; GCN-NEXT: v_accvgpr_write_b32 a2, s10 -; GCN-NEXT: v_accvgpr_write_b32 a3, s11 -; GCN-NEXT: v_accvgpr_write_b32 a4, s12 -; GCN-NEXT: v_accvgpr_write_b32 a5, s13 -; GCN-NEXT: v_accvgpr_write_b32 a6, s14 -; GCN-NEXT: v_accvgpr_write_b32 a7, s15 -; GCN-NEXT: v_accvgpr_write_b32 a8, s16 -; GCN-NEXT: v_accvgpr_write_b32 a9, s17 -; GCN-NEXT: v_accvgpr_write_b32 a10, s18 -; GCN-NEXT: v_accvgpr_write_b32 a11, s19 -; GCN-NEXT: v_accvgpr_write_b32 a12, s20 -; GCN-NEXT: v_accvgpr_write_b32 a13, s21 -; GCN-NEXT: v_accvgpr_write_b32 a14, s22 -; GCN-NEXT: v_accvgpr_write_b32 a15, s23 +; GCN-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; GCN-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; GCN-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GCN-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GCN-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GCN-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GCN-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 +; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_nop 7 ; GCN-NEXT: s_nop 2 -; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GCN-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 3, i32 2, i32 1) store <16 x float> %result, ptr addrspace(1) %out ret void } -attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } +attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" } attributes #1 = { "amdgpu-flat-work-group-size"="1,64" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index 21465be..d81ec1c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -141,20 +141,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; 
SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] +; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: @@ -166,16 +164,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: @@ -183,20 +179,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v8, 0 +; HEURRC-NEXT: v_mov_b32_e32 v12, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) ; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] +; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: @@ -266,20 +260,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; 
SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: @@ -291,16 +283,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: @@ -308,20 +298,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v8, 0 +; HEURRC-NEXT: v_mov_b32_e32 v12, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) ; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: @@ -1505,62 +1493,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: v_mov_b32_e32 v44, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 -; 
SDAG-NEXT: v_accvgpr_write_b32 a26, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 -; SDAG-NEXT: v_mov_b32_e32 v10, s20 -; SDAG-NEXT: v_mov_b32_e32 v11, s21 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] -; SDAG-NEXT: v_mov_b32_e32 v12, s22 -; SDAG-NEXT: v_mov_b32_e32 v13, s23 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; SDAG-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; SDAG-NEXT: v_mov_b32_e32 v40, s20 +; SDAG-NEXT: v_mov_b32_e32 v41, s21 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] +; SDAG-NEXT: v_mov_b32_e32 v42, s22 +; SDAG-NEXT: v_mov_b32_e32 v43, s23 +; SDAG-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: 
global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -1569,52 +1550,44 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GISEL-NEXT: v_mov_b32_e32 v24, 0 +; GISEL-NEXT: v_mov_b32_e32 v56, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9] +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] +; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] +; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: 
global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -1623,62 +1596,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; HEURRC-NEXT: v_mov_b32_e32 v8, 0 +; HEURRC-NEXT: v_mov_b32_e32 v44, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22 -; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21 -; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20 -; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19 -; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18 -; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17 -; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16 -; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14 -; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13 -; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12 -; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11 -; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10 -; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9 -; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8 -; HEURRC-NEXT: v_mov_b32_e32 v10, s20 -; HEURRC-NEXT: v_mov_b32_e32 v11, s21 -; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] -; HEURRC-NEXT: v_mov_b32_e32 v12, s22 -; HEURRC-NEXT: v_mov_b32_e32 v13, s23 -; HEURRC-NEXT: v_mov_b32_e32 v0, s16 -; HEURRC-NEXT: v_mov_b32_e32 v1, s17 -; HEURRC-NEXT: v_mov_b32_e32 v2, s18 -; HEURRC-NEXT: v_mov_b32_e32 v3, s19 -; HEURRC-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; HEURRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; HEURRC-NEXT: v_mov_b32_e32 v40, s20 +; HEURRC-NEXT: v_mov_b32_e32 v41, s21 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] +; HEURRC-NEXT: v_mov_b32_e32 v42, s22 +; HEURRC-NEXT: v_mov_b32_e32 v43, s23 +; HEURRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 ; 
HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: s_nop 2 +; HEURRC-NEXT: v_mov_b32_e32 v16, s16 +; HEURRC-NEXT: v_mov_b32_e32 v17, s17 +; HEURRC-NEXT: v_mov_b32_e32 v18, s18 +; HEURRC-NEXT: v_mov_b32_e32 v19, s19 +; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v0, s12 -; HEURRC-NEXT: v_mov_b32_e32 v1, s13 -; HEURRC-NEXT: v_mov_b32_e32 v2, s14 -; HEURRC-NEXT: v_mov_b32_e32 v3, s15 -; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s12 +; HEURRC-NEXT: v_mov_b32_e32 v17, s13 +; HEURRC-NEXT: v_mov_b32_e32 v18, s14 +; HEURRC-NEXT: v_mov_b32_e32 v19, s15 +; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v0, s8 -; HEURRC-NEXT: v_mov_b32_e32 v1, s9 -; HEURRC-NEXT: v_mov_b32_e32 v2, s10 -; HEURRC-NEXT: v_mov_b32_e32 v3, s11 -; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s8 +; HEURRC-NEXT: v_mov_b32_e32 v17, s9 +; HEURRC-NEXT: v_mov_b32_e32 v18, s10 +; HEURRC-NEXT: v_mov_b32_e32 v19, s11 +; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_endpgm ; @@ -1687,7 +1653,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; VGPRRC-NEXT: v_mov_b32_e32 v40, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v44, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) ; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27] ; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25] @@ -1701,41 +1667,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; VGPRRC-NEXT: v_mov_b32_e32 v42, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v43, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v40, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v41, s21 ; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] -; VGPRRC-NEXT: v_mov_b32_e32 v44, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v45, s23 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[42:45], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v42, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v43, s23 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 2 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 ; VGPRRC-NEXT: 
v_mov_b32_e32 v18, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd: @@ -1869,62 +1835,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v8, 0 +; SDAG-NEXT: v_mov_b32_e32 v44, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 -; SDAG-NEXT: v_mov_b32_e32 v10, s20 -; SDAG-NEXT: v_mov_b32_e32 v11, s21 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3 -; SDAG-NEXT: v_mov_b32_e32 v12, s22 -; SDAG-NEXT: v_mov_b32_e32 v13, s23 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 
v[38:39], s[30:31] +; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; SDAG-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; SDAG-NEXT: v_mov_b32_e32 v40, s20 +; SDAG-NEXT: v_mov_b32_e32 v41, s21 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 +; SDAG-NEXT: v_mov_b32_e32 v42, s22 +; SDAG-NEXT: v_mov_b32_e32 v43, s23 +; SDAG-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: s_nop 2 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -1933,52 +1892,44 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GISEL-NEXT: v_mov_b32_e32 v24, 0 +; GISEL-NEXT: v_mov_b32_e32 v56, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: 
v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:1 abid:2 blgp:3 -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9] +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:1 abid:2 blgp:3 +; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] +; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -1987,62 +1938,55 @@ define amdgpu_kernel void 
@test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; HEURRC-NEXT: v_mov_b32_e32 v8, 0 +; HEURRC-NEXT: v_mov_b32_e32 v44, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22 -; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21 -; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20 -; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19 -; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18 -; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17 -; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16 -; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14 -; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13 -; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12 -; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11 -; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10 -; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9 -; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8 -; HEURRC-NEXT: v_mov_b32_e32 v10, s20 -; HEURRC-NEXT: v_mov_b32_e32 v11, s21 -; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3 -; HEURRC-NEXT: v_mov_b32_e32 v12, s22 -; HEURRC-NEXT: v_mov_b32_e32 v13, s23 -; HEURRC-NEXT: v_mov_b32_e32 v0, s16 -; HEURRC-NEXT: v_mov_b32_e32 v1, s17 -; HEURRC-NEXT: v_mov_b32_e32 v2, s18 -; HEURRC-NEXT: v_mov_b32_e32 v3, s19 -; HEURRC-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; HEURRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; HEURRC-NEXT: v_mov_b32_e32 v40, s20 +; HEURRC-NEXT: v_mov_b32_e32 v41, s21 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 +; HEURRC-NEXT: v_mov_b32_e32 v42, s22 +; HEURRC-NEXT: v_mov_b32_e32 v43, s23 +; HEURRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: s_nop 2 +; HEURRC-NEXT: v_mov_b32_e32 v16, s16 +; HEURRC-NEXT: v_mov_b32_e32 v17, s17 +; HEURRC-NEXT: v_mov_b32_e32 v18, s18 +; HEURRC-NEXT: v_mov_b32_e32 v19, s19 +; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v0, s12 -; HEURRC-NEXT: v_mov_b32_e32 v1, s13 -; HEURRC-NEXT: v_mov_b32_e32 v2, s14 -; HEURRC-NEXT: v_mov_b32_e32 v3, s15 -; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s12 +; HEURRC-NEXT: v_mov_b32_e32 v17, s13 +; HEURRC-NEXT: v_mov_b32_e32 v18, s14 +; HEURRC-NEXT: v_mov_b32_e32 v19, s15 +; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: 
s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v0, s8 -; HEURRC-NEXT: v_mov_b32_e32 v1, s9 -; HEURRC-NEXT: v_mov_b32_e32 v2, s10 -; HEURRC-NEXT: v_mov_b32_e32 v3, s11 -; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s8 +; HEURRC-NEXT: v_mov_b32_e32 v17, s9 +; HEURRC-NEXT: v_mov_b32_e32 v18, s10 +; HEURRC-NEXT: v_mov_b32_e32 v19, s11 +; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_endpgm ; @@ -2051,7 +1995,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; VGPRRC-NEXT: v_mov_b32_e32 v40, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v44, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) ; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27] ; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25] @@ -2065,41 +2009,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] ; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; VGPRRC-NEXT: v_mov_b32_e32 v42, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v43, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v40, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v41, s21 ; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 -; VGPRRC-NEXT: v_mov_b32_e32 v44, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v45, s23 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[42:45], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: v_mov_b32_e32 v42, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v43, s23 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 2 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] 
sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd__flags: @@ -2234,35 +2178,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac: @@ -2271,35 +2207,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0xa4 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] +; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac: @@ -2308,35 +2236,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 -; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12 -; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13 -; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14 -; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16 -; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17 -; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18 -; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19 -; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20 -; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21 -; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22 -; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 +; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; 
HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] -; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] +; HEURRC-NEXT: v_mov_b32_e32 v16, 0 ; HEURRC-NEXT: s_nop 7 ; HEURRC-NEXT: s_nop 2 -; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac: @@ -2443,35 +2363,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v0, 
a[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags: @@ -2480,35 +2392,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 +; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags: @@ -2517,35 +2421,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 -; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 -; 
HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 -; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12 -; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13 -; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14 -; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16 -; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17 -; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18 -; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19 -; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20 -; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21 -; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22 -; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 +; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 -; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mov_b32_e32 v16, 0 ; HEURRC-NEXT: s_nop 7 ; HEURRC-NEXT: s_nop 2 -; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags: @@ -2781,24 +2677,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b32_e32 v8, s0 +; SDAG-NEXT: v_mov_b32_e32 v9, s1 +; SDAG-NEXT: v_mov_b32_e32 v10, s2 +; SDAG-NEXT: v_mov_b32_e32 v11, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] +; SDAG-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] ; SDAG-NEXT: s_nop 7 -; 
SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: @@ -2810,16 +2706,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: @@ -2827,24 +2721,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: v_mov_b32_e32 v12, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v2, s8 -; HEURRC-NEXT: v_mov_b32_e32 v3, s9 -; HEURRC-NEXT: v_mov_b32_e32 v4, s10 -; HEURRC-NEXT: v_mov_b32_e32 v5, s11 -; HEURRC-NEXT: v_mov_b32_e32 v6, s12 -; HEURRC-NEXT: v_mov_b32_e32 v7, s13 -; HEURRC-NEXT: v_mov_b32_e32 v8, s14 -; HEURRC-NEXT: v_mov_b32_e32 v9, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 +; HEURRC-NEXT: v_mov_b32_e32 v0, s8 +; HEURRC-NEXT: v_mov_b32_e32 v1, s9 +; HEURRC-NEXT: v_mov_b32_e32 v2, s10 +; HEURRC-NEXT: v_mov_b32_e32 v3, s11 +; HEURRC-NEXT: v_mov_b32_e32 v4, s12 +; HEURRC-NEXT: v_mov_b32_e32 v5, s13 +; HEURRC-NEXT: v_mov_b32_e32 v6, s14 +; HEURRC-NEXT: v_mov_b32_e32 v7, s15 +; HEURRC-NEXT: v_mov_b32_e32 v8, s0 +; HEURRC-NEXT: v_mov_b32_e32 v9, s1 +; HEURRC-NEXT: v_mov_b32_e32 v10, s2 +; HEURRC-NEXT: v_mov_b32_e32 v11, s3 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] +; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: @@ -2852,24 +2746,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; VGPRRC-NEXT: v_mov_b32_e32 v4, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v12, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b32_e32 v6, s8 -; VGPRRC-NEXT: v_mov_b32_e32 v7, s9 -; VGPRRC-NEXT: v_mov_b32_e32 v8, s10 -; VGPRRC-NEXT: v_mov_b32_e32 v9, s11 -; VGPRRC-NEXT: v_mov_b32_e32 v10, s12 -; VGPRRC-NEXT: v_mov_b32_e32 v11, s13 -; VGPRRC-NEXT: v_mov_b32_e32 v12, s14 -; VGPRRC-NEXT: v_mov_b32_e32 v13, s15 -; VGPRRC-NEXT: 
v_mov_b32_e32 v0, s0 -; VGPRRC-NEXT: v_mov_b32_e32 v1, s1 -; VGPRRC-NEXT: v_mov_b32_e32 v2, s2 -; VGPRRC-NEXT: v_mov_b32_e32 v3, s3 +; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 +; VGPRRC-NEXT: v_mov_b32_e32 v4, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v5, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v6, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v7, s15 +; VGPRRC-NEXT: v_mov_b32_e32 v8, s0 +; VGPRRC-NEXT: v_mov_b32_e32 v9, s1 +; VGPRRC-NEXT: v_mov_b32_e32 v10, s2 +; VGPRRC-NEXT: v_mov_b32_e32 v11, s3 ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[6:9], v[10:13], v[0:3] +; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: ; AGPR: ; %bb.0: @@ -2930,24 +2824,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b32_e32 v8, s0 +; SDAG-NEXT: v_mov_b32_e32 v9, s1 +; SDAG-NEXT: v_mov_b32_e32 v10, s2 +; SDAG-NEXT: v_mov_b32_e32 v11, s3 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: @@ -2959,16 +2853,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GISEL-NEXT: 
global_store_dwordx4 v4, v[0:3], s[6:7] ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: @@ -2976,24 +2868,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: v_mov_b32_e32 v12, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v2, s8 -; HEURRC-NEXT: v_mov_b32_e32 v3, s9 -; HEURRC-NEXT: v_mov_b32_e32 v4, s10 -; HEURRC-NEXT: v_mov_b32_e32 v5, s11 -; HEURRC-NEXT: v_mov_b32_e32 v6, s12 -; HEURRC-NEXT: v_mov_b32_e32 v7, s13 -; HEURRC-NEXT: v_mov_b32_e32 v8, s14 -; HEURRC-NEXT: v_mov_b32_e32 v9, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 +; HEURRC-NEXT: v_mov_b32_e32 v0, s8 +; HEURRC-NEXT: v_mov_b32_e32 v1, s9 +; HEURRC-NEXT: v_mov_b32_e32 v2, s10 +; HEURRC-NEXT: v_mov_b32_e32 v3, s11 +; HEURRC-NEXT: v_mov_b32_e32 v4, s12 +; HEURRC-NEXT: v_mov_b32_e32 v5, s13 +; HEURRC-NEXT: v_mov_b32_e32 v6, s14 +; HEURRC-NEXT: v_mov_b32_e32 v7, s15 +; HEURRC-NEXT: v_mov_b32_e32 v8, s0 +; HEURRC-NEXT: v_mov_b32_e32 v9, s1 +; HEURRC-NEXT: v_mov_b32_e32 v10, s2 +; HEURRC-NEXT: v_mov_b32_e32 v11, s3 ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: @@ -3001,24 +2893,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr ; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; VGPRRC-NEXT: v_mov_b32_e32 v4, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v12, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b32_e32 v6, s8 -; VGPRRC-NEXT: v_mov_b32_e32 v7, s9 -; VGPRRC-NEXT: v_mov_b32_e32 v8, s10 -; VGPRRC-NEXT: v_mov_b32_e32 v9, s11 -; VGPRRC-NEXT: v_mov_b32_e32 v10, s12 -; VGPRRC-NEXT: v_mov_b32_e32 v11, s13 -; VGPRRC-NEXT: v_mov_b32_e32 v12, s14 -; VGPRRC-NEXT: v_mov_b32_e32 v13, s15 -; VGPRRC-NEXT: v_mov_b32_e32 v0, s0 -; VGPRRC-NEXT: v_mov_b32_e32 v1, s1 -; VGPRRC-NEXT: v_mov_b32_e32 v2, s2 -; VGPRRC-NEXT: v_mov_b32_e32 v3, s3 +; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 +; VGPRRC-NEXT: v_mov_b32_e32 v4, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v5, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v6, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v7, s15 +; VGPRRC-NEXT: v_mov_b32_e32 v8, s0 +; VGPRRC-NEXT: v_mov_b32_e32 v9, s1 +; VGPRRC-NEXT: v_mov_b32_e32 v10, s2 +; VGPRRC-NEXT: v_mov_b32_e32 v11, s3 ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1 +; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 ; VGPRRC-NEXT: s_nop 7 -; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: 
test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: ; AGPR: ; %bb.0: @@ -4246,70 +4138,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v40, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s20 -; SDAG-NEXT: v_mov_b32_e32 v3, s21 -; SDAG-NEXT: v_mov_b32_e32 v4, s22 -; SDAG-NEXT: v_mov_b32_e32 v5, s23 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b32_e32 v6, s24 -; SDAG-NEXT: v_mov_b32_e32 v7, s25 -; SDAG-NEXT: v_mov_b32_e32 v8, s26 -; SDAG-NEXT: v_mov_b32_e32 v9, s27 +; SDAG-NEXT: v_mov_b32_e32 v36, s24 +; SDAG-NEXT: v_mov_b32_e32 v37, s25 +; SDAG-NEXT: v_mov_b32_e32 v38, s26 +; SDAG-NEXT: v_mov_b32_e32 v39, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 +; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] -; SDAG-NEXT: v_mov_b32_e32 v2, s20 -; SDAG-NEXT: v_mov_b32_e32 v3, s21 -; SDAG-NEXT: v_mov_b32_e32 v4, s22 -; SDAG-NEXT: v_mov_b32_e32 v5, s23 -; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] +; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v2, s16 -; SDAG-NEXT: v_mov_b32_e32 v3, s17 -; SDAG-NEXT: v_mov_b32_e32 v4, s18 -; SDAG-NEXT: v_mov_b32_e32 v5, s19 -; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v2, s12 -; SDAG-NEXT: v_mov_b32_e32 v3, s13 -; SDAG-NEXT: v_mov_b32_e32 v4, s14 -; SDAG-NEXT: v_mov_b32_e32 v5, s15 -; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; 
SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -4318,52 +4203,44 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GISEL-NEXT: v_mov_b32_e32 v24, 0 +; GISEL-NEXT: v_mov_b32_e32 v56, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: 
v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9] +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] +; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] +; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -4371,70 +4248,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: v_mov_b32_e32 v40, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v2, s20 -; HEURRC-NEXT: v_mov_b32_e32 v3, s21 -; HEURRC-NEXT: v_mov_b32_e32 v4, s22 -; HEURRC-NEXT: v_mov_b32_e32 v5, s23 +; HEURRC-NEXT: v_mov_b32_e32 v32, s20 +; HEURRC-NEXT: v_mov_b32_e32 v33, s21 +; HEURRC-NEXT: v_mov_b32_e32 v34, s22 +; HEURRC-NEXT: v_mov_b32_e32 v35, s23 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; HEURRC-NEXT: v_mov_b32_e32 v6, s24 -; HEURRC-NEXT: v_mov_b32_e32 v7, s25 -; HEURRC-NEXT: v_mov_b32_e32 v8, s26 -; HEURRC-NEXT: v_mov_b32_e32 v9, s27 +; HEURRC-NEXT: v_mov_b32_e32 v36, s24 +; HEURRC-NEXT: v_mov_b32_e32 v37, s25 +; HEURRC-NEXT: v_mov_b32_e32 v38, s26 +; HEURRC-NEXT: v_mov_b32_e32 v39, s27 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 -; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22 -; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21 -; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20 -; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19 -; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18 -; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17 -; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16 -; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14 -; HEURRC-NEXT: 
v_accvgpr_write_b32 a21, s13 -; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12 -; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11 -; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10 -; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9 -; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8 +; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] -; HEURRC-NEXT: v_mov_b32_e32 v2, s20 -; HEURRC-NEXT: v_mov_b32_e32 v3, s21 -; HEURRC-NEXT: v_mov_b32_e32 v4, s22 -; HEURRC-NEXT: v_mov_b32_e32 v5, s23 -; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] +; HEURRC-NEXT: s_nop 6 +; HEURRC-NEXT: v_mov_b32_e32 v16, s20 +; HEURRC-NEXT: v_mov_b32_e32 v17, s21 +; HEURRC-NEXT: v_mov_b32_e32 v18, s22 +; HEURRC-NEXT: v_mov_b32_e32 v19, s23 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v2, s16 -; HEURRC-NEXT: v_mov_b32_e32 v3, s17 -; HEURRC-NEXT: v_mov_b32_e32 v4, s18 -; HEURRC-NEXT: v_mov_b32_e32 v5, s19 -; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s16 +; HEURRC-NEXT: v_mov_b32_e32 v17, s17 +; HEURRC-NEXT: v_mov_b32_e32 v18, s18 +; HEURRC-NEXT: v_mov_b32_e32 v19, s19 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v2, s12 -; HEURRC-NEXT: v_mov_b32_e32 v3, s13 -; HEURRC-NEXT: v_mov_b32_e32 v4, s14 -; HEURRC-NEXT: v_mov_b32_e32 v5, s15 -; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s12 +; HEURRC-NEXT: v_mov_b32_e32 v17, s13 +; HEURRC-NEXT: v_mov_b32_e32 v18, s14 +; HEURRC-NEXT: v_mov_b32_e32 v19, s15 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v2, s8 -; HEURRC-NEXT: v_mov_b32_e32 v3, s9 -; HEURRC-NEXT: v_mov_b32_e32 v4, s10 -; HEURRC-NEXT: v_mov_b32_e32 v5, s11 -; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s8 +; HEURRC-NEXT: v_mov_b32_e32 v17, s9 +; HEURRC-NEXT: v_mov_b32_e32 v18, s10 +; HEURRC-NEXT: v_mov_b32_e32 v19, s11 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: 
s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_endpgm ; @@ -4442,17 +4312,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; VGPRRC-NEXT: v_mov_b32_e32 v32, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v40, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b32_e32 v34, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v35, s21 -; VGPRRC-NEXT: v_mov_b32_e32 v36, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v37, s23 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 ; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; VGPRRC-NEXT: v_mov_b32_e32 v38, s24 -; VGPRRC-NEXT: v_mov_b32_e32 v39, s25 -; VGPRRC-NEXT: v_mov_b32_e32 v40, s26 -; VGPRRC-NEXT: v_mov_b32_e32 v41, s27 +; VGPRRC-NEXT: v_mov_b32_e32 v36, s24 +; VGPRRC-NEXT: v_mov_b32_e32 v37, s25 +; VGPRRC-NEXT: v_mov_b32_e32 v38, s26 +; VGPRRC-NEXT: v_mov_b32_e32 v39, s27 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) ; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] ; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] @@ -4463,42 +4333,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[34:37], v[38:41], v[16:31] +; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] ; VGPRRC-NEXT: s_nop 6 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s20 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s21 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s22 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s23 -; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: 
global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd: @@ -4645,70 +4515,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v40, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s20 -; SDAG-NEXT: v_mov_b32_e32 v3, s21 -; SDAG-NEXT: v_mov_b32_e32 v4, s22 -; SDAG-NEXT: v_mov_b32_e32 v5, s23 +; SDAG-NEXT: v_mov_b32_e32 v32, s20 +; SDAG-NEXT: v_mov_b32_e32 v33, s21 +; SDAG-NEXT: v_mov_b32_e32 v34, s22 +; SDAG-NEXT: v_mov_b32_e32 v35, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b32_e32 v6, s24 -; SDAG-NEXT: v_mov_b32_e32 v7, s25 -; SDAG-NEXT: v_mov_b32_e32 v8, s26 -; SDAG-NEXT: v_mov_b32_e32 v9, s27 +; SDAG-NEXT: v_mov_b32_e32 v36, s24 +; SDAG-NEXT: v_mov_b32_e32 v37, s25 +; SDAG-NEXT: v_mov_b32_e32 v38, s26 +; SDAG-NEXT: v_mov_b32_e32 v39, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 +; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] cbsz:1 abid:2 blgp:3 -; SDAG-NEXT: v_mov_b32_e32 v2, s20 -; SDAG-NEXT: v_mov_b32_e32 v3, s21 -; SDAG-NEXT: v_mov_b32_e32 v4, s22 -; SDAG-NEXT: v_mov_b32_e32 v5, s23 -; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 +; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v2, s16 -; SDAG-NEXT: v_mov_b32_e32 v3, s17 -; SDAG-NEXT: v_mov_b32_e32 v4, s18 -; SDAG-NEXT: v_mov_b32_e32 v5, s19 -; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v2, s12 -; 
SDAG-NEXT: v_mov_b32_e32 v3, s13 -; SDAG-NEXT: v_mov_b32_e32 v4, s14 -; SDAG-NEXT: v_mov_b32_e32 v5, s15 -; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -4717,52 +4580,44 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; GISEL-NEXT: v_mov_b32_e32 v24, 0 +; GISEL-NEXT: v_mov_b32_e32 v56, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:1 abid:2 blgp:3 -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; 
GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9] +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:1 abid:2 blgp:3 +; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21] +; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm ; @@ -4770,70 +4625,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; HEURRC: ; %bb.0: ; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: v_mov_b32_e32 v40, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v2, s20 -; HEURRC-NEXT: v_mov_b32_e32 v3, s21 -; HEURRC-NEXT: v_mov_b32_e32 v4, s22 -; HEURRC-NEXT: v_mov_b32_e32 v5, s23 +; HEURRC-NEXT: v_mov_b32_e32 v32, s20 +; HEURRC-NEXT: v_mov_b32_e32 v33, s21 +; HEURRC-NEXT: v_mov_b32_e32 v34, s22 +; HEURRC-NEXT: v_mov_b32_e32 v35, s23 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; HEURRC-NEXT: v_mov_b32_e32 v6, s24 -; HEURRC-NEXT: v_mov_b32_e32 v7, s25 -; HEURRC-NEXT: v_mov_b32_e32 v8, s26 -; HEURRC-NEXT: v_mov_b32_e32 v9, s27 +; HEURRC-NEXT: v_mov_b32_e32 v36, s24 +; HEURRC-NEXT: v_mov_b32_e32 v37, s25 +; HEURRC-NEXT: v_mov_b32_e32 v38, s26 +; HEURRC-NEXT: v_mov_b32_e32 v39, s27 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 -; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22 -; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21 -; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20 -; HEURRC-NEXT: v_accvgpr_write_b32 
a27, s19 -; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18 -; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17 -; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16 -; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14 -; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13 -; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12 -; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11 -; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10 -; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9 -; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8 +; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] cbsz:1 abid:2 blgp:3 -; HEURRC-NEXT: v_mov_b32_e32 v2, s20 -; HEURRC-NEXT: v_mov_b32_e32 v3, s21 -; HEURRC-NEXT: v_mov_b32_e32 v4, s22 -; HEURRC-NEXT: v_mov_b32_e32 v5, s23 -; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 +; HEURRC-NEXT: s_nop 6 +; HEURRC-NEXT: v_mov_b32_e32 v16, s20 +; HEURRC-NEXT: v_mov_b32_e32 v17, s21 +; HEURRC-NEXT: v_mov_b32_e32 v18, s22 +; HEURRC-NEXT: v_mov_b32_e32 v19, s23 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v2, s16 -; HEURRC-NEXT: v_mov_b32_e32 v3, s17 -; HEURRC-NEXT: v_mov_b32_e32 v4, s18 -; HEURRC-NEXT: v_mov_b32_e32 v5, s19 -; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s16 +; HEURRC-NEXT: v_mov_b32_e32 v17, s17 +; HEURRC-NEXT: v_mov_b32_e32 v18, s18 +; HEURRC-NEXT: v_mov_b32_e32 v19, s19 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v2, s12 -; HEURRC-NEXT: v_mov_b32_e32 v3, s13 -; HEURRC-NEXT: v_mov_b32_e32 v4, s14 -; HEURRC-NEXT: v_mov_b32_e32 v5, s15 -; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s12 +; HEURRC-NEXT: v_mov_b32_e32 v17, s13 +; HEURRC-NEXT: v_mov_b32_e32 v18, s14 +; HEURRC-NEXT: v_mov_b32_e32 v19, s15 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_nop 0 -; HEURRC-NEXT: v_mov_b32_e32 v2, s8 -; HEURRC-NEXT: v_mov_b32_e32 v3, s9 -; HEURRC-NEXT: v_mov_b32_e32 v4, s10 -; HEURRC-NEXT: v_mov_b32_e32 v5, s11 -; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1 +; HEURRC-NEXT: v_mov_b32_e32 v16, s8 +; HEURRC-NEXT: v_mov_b32_e32 v17, s9 +; HEURRC-NEXT: v_mov_b32_e32 v18, s10 +; HEURRC-NEXT: v_mov_b32_e32 v19, s11 +; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: 
global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) -; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 ; HEURRC-NEXT: s_waitcnt vmcnt(0) ; HEURRC-NEXT: s_endpgm ; @@ -4841,17 +4689,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; VGPRRC: ; %bb.0: ; VGPRRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 -; VGPRRC-NEXT: v_mov_b32_e32 v32, 0 +; VGPRRC-NEXT: v_mov_b32_e32 v40, 0 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) -; VGPRRC-NEXT: v_mov_b32_e32 v34, s20 -; VGPRRC-NEXT: v_mov_b32_e32 v35, s21 -; VGPRRC-NEXT: v_mov_b32_e32 v36, s22 -; VGPRRC-NEXT: v_mov_b32_e32 v37, s23 +; VGPRRC-NEXT: v_mov_b32_e32 v32, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 ; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; VGPRRC-NEXT: v_mov_b32_e32 v38, s24 -; VGPRRC-NEXT: v_mov_b32_e32 v39, s25 -; VGPRRC-NEXT: v_mov_b32_e32 v40, s26 -; VGPRRC-NEXT: v_mov_b32_e32 v41, s27 +; VGPRRC-NEXT: v_mov_b32_e32 v36, s24 +; VGPRRC-NEXT: v_mov_b32_e32 v37, s25 +; VGPRRC-NEXT: v_mov_b32_e32 v38, s26 +; VGPRRC-NEXT: v_mov_b32_e32 v39, s27 ; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) ; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] ; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] @@ -4862,42 +4710,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] ; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; VGPRRC-NEXT: s_nop 1 -; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[34:37], v[38:41], v[16:31] cbsz:1 abid:2 blgp:3 +; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 ; VGPRRC-NEXT: s_nop 6 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s20 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s21 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s22 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s23 -; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 -; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 -; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_nop 0 ; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 ; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 ; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 ; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 -; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: 
global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) -; VGPRRC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 ; VGPRRC-NEXT: s_waitcnt vmcnt(0) ; VGPRRC-NEXT: s_endpgm ; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags: @@ -5045,41 +4893,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 ; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s20 -; SDAG-NEXT: v_mov_b32_e32 v1, s21 -; SDAG-NEXT: v_mov_b32_e32 v2, s22 -; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b32_e32 v4, s24 -; SDAG-NEXT: v_mov_b32_e32 v5, s25 -; SDAG-NEXT: v_mov_b32_e32 v6, s26 -; SDAG-NEXT: v_mov_b32_e32 v7, s27 +; SDAG-NEXT: v_mov_b32_e32 v20, s24 +; SDAG-NEXT: v_mov_b32_e32 v21, s25 +; SDAG-NEXT: v_mov_b32_e32 v22, s26 +; SDAG-NEXT: v_mov_b32_e32 v23, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac: @@ -5088,35 +4928,27 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], 
s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] +; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac: @@ -5124,41 +4956,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 ; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v0, s20 -; HEURRC-NEXT: v_mov_b32_e32 v1, s21 -; HEURRC-NEXT: v_mov_b32_e32 v2, s22 -; HEURRC-NEXT: v_mov_b32_e32 v3, s23 +; HEURRC-NEXT: v_mov_b32_e32 v16, s20 +; HEURRC-NEXT: v_mov_b32_e32 v17, s21 +; HEURRC-NEXT: v_mov_b32_e32 v18, s22 +; HEURRC-NEXT: v_mov_b32_e32 v19, s23 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; HEURRC-NEXT: v_mov_b32_e32 v4, s24 -; HEURRC-NEXT: v_mov_b32_e32 v5, s25 -; HEURRC-NEXT: v_mov_b32_e32 v6, s26 -; HEURRC-NEXT: v_mov_b32_e32 v7, s27 +; HEURRC-NEXT: v_mov_b32_e32 v20, s24 +; HEURRC-NEXT: v_mov_b32_e32 v21, s25 +; HEURRC-NEXT: v_mov_b32_e32 v22, s26 +; HEURRC-NEXT: v_mov_b32_e32 v23, s27 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 -; HEURRC-NEXT: 
v_accvgpr_write_b32 a4, s12 -; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13 -; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14 -; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16 -; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17 -; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18 -; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19 -; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20 -; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21 -; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22 -; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] -; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] +; HEURRC-NEXT: v_mov_b32_e32 v16, 0 ; HEURRC-NEXT: s_nop 7 ; HEURRC-NEXT: s_nop 2 -; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac: @@ -5279,41 +5103,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s20 -; SDAG-NEXT: v_mov_b32_e32 v1, s21 -; SDAG-NEXT: v_mov_b32_e32 v2, s22 -; SDAG-NEXT: v_mov_b32_e32 v3, s23 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; SDAG-NEXT: v_mov_b32_e32 v4, s24 -; SDAG-NEXT: v_mov_b32_e32 v5, s25 -; SDAG-NEXT: v_mov_b32_e32 v6, s26 -; SDAG-NEXT: v_mov_b32_e32 v7, s27 +; SDAG-NEXT: v_mov_b32_e32 v20, s24 +; SDAG-NEXT: v_mov_b32_e32 v21, s25 +; SDAG-NEXT: v_mov_b32_e32 v22, s26 +; SDAG-NEXT: v_mov_b32_e32 v23, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; 
SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags: @@ -5322,35 +5138,27 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s23 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 +; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GISEL-NEXT: 
global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags: @@ -5358,41 +5166,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 ; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_mov_b32_e32 v0, s20 -; HEURRC-NEXT: v_mov_b32_e32 v1, s21 -; HEURRC-NEXT: v_mov_b32_e32 v2, s22 -; HEURRC-NEXT: v_mov_b32_e32 v3, s23 +; HEURRC-NEXT: v_mov_b32_e32 v16, s20 +; HEURRC-NEXT: v_mov_b32_e32 v17, s21 +; HEURRC-NEXT: v_mov_b32_e32 v18, s22 +; HEURRC-NEXT: v_mov_b32_e32 v19, s23 ; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 -; HEURRC-NEXT: v_mov_b32_e32 v4, s24 -; HEURRC-NEXT: v_mov_b32_e32 v5, s25 -; HEURRC-NEXT: v_mov_b32_e32 v6, s26 -; HEURRC-NEXT: v_mov_b32_e32 v7, s27 +; HEURRC-NEXT: v_mov_b32_e32 v20, s24 +; HEURRC-NEXT: v_mov_b32_e32 v21, s25 +; HEURRC-NEXT: v_mov_b32_e32 v22, s26 +; HEURRC-NEXT: v_mov_b32_e32 v23, s27 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 -; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12 -; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13 -; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14 -; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15 -; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16 -; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17 -; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18 -; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19 -; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20 -; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21 -; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22 -; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 -; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mov_b32_e32 v16, 0 ; HEURRC-NEXT: s_nop 7 ; HEURRC-NEXT: s_nop 2 -; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags: @@ -5643,20 +5443,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; 
GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GCN-NEXT: v_accvgpr_write_b32 a0, s0 +; GCN-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GCN-NEXT: v_accvgpr_write_b32 a1, s1 -; GCN-NEXT: v_accvgpr_write_b32 a2, s2 -; GCN-NEXT: v_accvgpr_write_b32 a3, s3 +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] +; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; GCN-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: @@ -5664,20 +5462,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v8, 0 +; HEURRC-NEXT: v_mov_b32_e32 v12, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) ; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] +; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: @@ -5747,20 +5543,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v12, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GCN-NEXT: v_accvgpr_write_b32 a0, s0 +; GCN-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GCN-NEXT: v_accvgpr_write_b32 a1, s1 -; GCN-NEXT: v_accvgpr_write_b32 a2, s2 -; GCN-NEXT: v_accvgpr_write_b32 a3, s3 +; GCN-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GCN-NEXT: s_nop 1 -; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; GCN-NEXT: s_endpgm ; ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: @@ -5768,20 +5562,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 ; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; HEURRC-NEXT: v_mov_b32_e32 v8, 0 +; HEURRC-NEXT: v_mov_b32_e32 v12, 0 ; HEURRC-NEXT: s_waitcnt lgkmcnt(0) ; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; 
HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3] ; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 -; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 -; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; HEURRC-NEXT: s_nop 1 -; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 ; HEURRC-NEXT: s_nop 7 -; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] ; HEURRC-NEXT: s_endpgm ; ; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: @@ -5845,5 +5637,5 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ret void } -attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } +attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" } attributes #1 = { "amdgpu-flat-work-group-size"="1,64" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll index 37809da..f78ea92 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll @@ -1895,36 +1895,36 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v10, s16 -; SDAG-NEXT: v_mov_b32_e32 v11, s17 -; SDAG-NEXT: v_mov_b32_e32 v12, s18 -; SDAG-NEXT: v_mov_b32_e32 v13, s19 -; SDAG-NEXT: v_mov_b32_e32 v14, s20 -; SDAG-NEXT: v_mov_b32_e32 v15, s21 -; SDAG-NEXT: v_mov_b32_e32 v16, s22 -; SDAG-NEXT: v_mov_b32_e32 v17, s23 +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mov_b32_e32 v9, s17 +; SDAG-NEXT: v_mov_b32_e32 v10, s18 +; SDAG-NEXT: v_mov_b32_e32 v11, s19 +; SDAG-NEXT: v_mov_b32_e32 v12, s20 +; SDAG-NEXT: v_mov_b32_e32 v13, s21 +; SDAG-NEXT: v_mov_b32_e32 v14, s22 +; SDAG-NEXT: v_mov_b32_e32 v15, s23 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: v_mov_b32_e32 v21, s13 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], s12, v1 op_sel:[1,1,0] 
op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s12, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[14:15] +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[14:15] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd: @@ -1937,20 +1937,18 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s24 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s25 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s26 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s27 -; GISEL-NEXT: v_mov_b32_e32 v16, s29 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; GISEL-NEXT: v_mov_b32_e32 v20, s29 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s28, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s28, v20 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[30:31] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -1964,40 +1962,38 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 ; SDAG-NEXT: s_movk_i32 s6, 0x41 ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; SDAG-NEXT: v_mov_b32_e32 v10, s16 -; SDAG-NEXT: v_mov_b32_e32 v11, s17 -; SDAG-NEXT: v_mov_b32_e32 v12, s18 -; SDAG-NEXT: v_mov_b32_e32 v13, s19 -; SDAG-NEXT: v_mov_b32_e32 v14, s20 -; SDAG-NEXT: v_mov_b32_e32 v15, s21 -; SDAG-NEXT: v_mov_b32_e32 v16, s22 -; SDAG-NEXT: v_mov_b32_e32 v17, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3] +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mov_b32_e32 v9, s17 +; SDAG-NEXT: v_mov_b32_e32 v10, s18 +; SDAG-NEXT: v_mov_b32_e32 v11, s19 +; SDAG-NEXT: v_mov_b32_e32 v12, s20 +; SDAG-NEXT: v_mov_b32_e32 v13, s21 +; 
SDAG-NEXT: v_mov_b32_e32 v14, s22 +; SDAG-NEXT: v_mov_b32_e32 v15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s6, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v20, 0x41 ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] @@ -2005,19 +2001,17 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2031,40 +2025,38 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 ; SDAG-NEXT: s_movk_i32 s6, 0x41 ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; SDAG-NEXT: v_mov_b32_e32 v10, s16 -; SDAG-NEXT: v_mov_b32_e32 v11, s17 -; SDAG-NEXT: v_mov_b32_e32 v12, s18 -; SDAG-NEXT: v_mov_b32_e32 v13, s19 -; SDAG-NEXT: v_mov_b32_e32 v14, s20 -; SDAG-NEXT: v_mov_b32_e32 v15, s21 -; SDAG-NEXT: v_mov_b32_e32 v16, s22 -; SDAG-NEXT: v_mov_b32_e32 v17, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: 
v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3] +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mov_b32_e32 v9, s17 +; SDAG-NEXT: v_mov_b32_e32 v10, s18 +; SDAG-NEXT: v_mov_b32_e32 v11, s19 +; SDAG-NEXT: v_mov_b32_e32 v12, s20 +; SDAG-NEXT: v_mov_b32_e32 v13, s21 +; SDAG-NEXT: v_mov_b32_e32 v14, s22 +; SDAG-NEXT: v_mov_b32_e32 v15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], s6, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v20, 0x41 ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] @@ -2072,19 +2064,17 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, 1.0 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 1065353216) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2096,34 +2086,32 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, 
s0 -; SDAG-NEXT: v_mov_b32_e32 v10, s16 -; SDAG-NEXT: v_mov_b32_e32 v11, s17 -; SDAG-NEXT: v_mov_b32_e32 v12, s18 -; SDAG-NEXT: v_mov_b32_e32 v13, s19 -; SDAG-NEXT: v_mov_b32_e32 v14, s20 -; SDAG-NEXT: v_mov_b32_e32 v15, s21 -; SDAG-NEXT: v_mov_b32_e32 v16, s22 -; SDAG-NEXT: v_mov_b32_e32 v17, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3] +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mov_b32_e32 v9, s17 +; SDAG-NEXT: v_mov_b32_e32 v10, s18 +; SDAG-NEXT: v_mov_b32_e32 v11, s19 +; SDAG-NEXT: v_mov_b32_e32 v12, s20 +; SDAG-NEXT: v_mov_b32_e32 v13, s21 +; SDAG-NEXT: v_mov_b32_e32 v14, s22 +; SDAG-NEXT: v_mov_b32_e32 v15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm: @@ -2136,21 +2124,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 -2) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2162,34 +2148,32 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 ; SDAG-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: 
v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; SDAG-NEXT: v_mov_b32_e32 v10, s16 -; SDAG-NEXT: v_mov_b32_e32 v11, s17 -; SDAG-NEXT: v_mov_b32_e32 v12, s18 -; SDAG-NEXT: v_mov_b32_e32 v13, s19 -; SDAG-NEXT: v_mov_b32_e32 v14, s20 -; SDAG-NEXT: v_mov_b32_e32 v15, s21 -; SDAG-NEXT: v_mov_b32_e32 v16, s22 -; SDAG-NEXT: v_mov_b32_e32 v17, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; SDAG-NEXT: v_mov_b32_e32 v0, s8 +; SDAG-NEXT: v_mov_b32_e32 v1, s9 +; SDAG-NEXT: v_mov_b32_e32 v2, s10 +; SDAG-NEXT: v_mov_b32_e32 v3, s11 +; SDAG-NEXT: v_mov_b32_e32 v4, s12 +; SDAG-NEXT: v_mov_b32_e32 v5, s13 +; SDAG-NEXT: v_mov_b32_e32 v6, s14 +; SDAG-NEXT: v_mov_b32_e32 v7, s15 +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[2:3] +; SDAG-NEXT: v_mov_b32_e32 v8, s16 +; SDAG-NEXT: v_mov_b32_e32 v9, s17 +; SDAG-NEXT: v_mov_b32_e32 v10, s18 +; SDAG-NEXT: v_mov_b32_e32 v11, s19 +; SDAG-NEXT: v_mov_b32_e32 v12, s20 +; SDAG-NEXT: v_mov_b32_e32 v13, s21 +; SDAG-NEXT: v_mov_b32_e32 v14, s22 +; SDAG-NEXT: v_mov_b32_e32 v15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[2:9], v[10:17], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0] ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 3 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; SDAG-NEXT: global_store_dwordx4 v20, v[0:3], s[4:5] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal: @@ -2202,21 +2186,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1] ; GISEL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x50 ; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], 1.0, 0.15915494 op_sel:[1,1,0] op_sel_hi:[1,0,0] +; GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5] +; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 1042479491) store <4 x float> %result, ptr addrspace(1) %ptr, align 16 @@ -2559,5 +2541,5 @@ declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 declare <4 
x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1 -attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } +attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" } attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll index bc50058..0b2818f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll @@ -4539,49 +4539,41 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> ; SDAG-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, s36 -; SDAG-NEXT: v_mov_b32_e32 v2, s8 -; SDAG-NEXT: v_mov_b32_e32 v3, s9 -; SDAG-NEXT: v_mov_b32_e32 v4, s10 -; SDAG-NEXT: v_mov_b32_e32 v5, s11 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_mov_b32_e32 v10, s16 -; SDAG-NEXT: v_mov_b32_e32 v11, s17 -; SDAG-NEXT: v_mov_b32_e32 v12, s18 -; SDAG-NEXT: v_mov_b32_e32 v13, s19 -; SDAG-NEXT: v_mov_b32_e32 v14, s20 -; SDAG-NEXT: v_mov_b32_e32 v15, s21 -; SDAG-NEXT: v_mov_b32_e32 v16, s22 -; SDAG-NEXT: v_mov_b32_e32 v17, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s37 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s38 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s39 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s40 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s41 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s42 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s43 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s44 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s45 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s46 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s47 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s48 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s49 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s50 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s51 -; SDAG-NEXT: v_mov_b32_e32 v0, s1 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: v_mov_b32_e32 v20, s12 +; SDAG-NEXT: v_mov_b32_e32 v21, s13 +; SDAG-NEXT: v_mov_b32_e32 v22, s14 +; SDAG-NEXT: v_mov_b32_e32 v23, s15 +; SDAG-NEXT: v_mov_b32_e32 v24, s16 +; SDAG-NEXT: v_mov_b32_e32 v25, s17 +; SDAG-NEXT: v_mov_b32_e32 v26, s18 +; SDAG-NEXT: v_mov_b32_e32 v27, s19 +; SDAG-NEXT: v_mov_b32_e32 v28, s20 +; SDAG-NEXT: v_mov_b32_e32 v29, s21 +; SDAG-NEXT: v_mov_b32_e32 v30, s22 +; SDAG-NEXT: v_mov_b32_e32 v31, s23 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; SDAG-NEXT: v_mov_b32_e32 v32, s1 ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], s0, v0 
op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s0, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48 -; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32 -; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[2:3] offset:16 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] +; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 +; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 +; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd: @@ -4590,41 +4582,33 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32> ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 ; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x80 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s36 -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s37 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s38 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s39 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s40 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s41 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s42 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s43 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s44 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s45 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s46 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s47 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s48 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s49 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s50 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s51 -; GISEL-NEXT: v_mov_b32_e32 v16, s1 +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] +; GISEL-NEXT: v_mov_b32_e32 v32, s1 ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s0, v16 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s0, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] -; GISEL-NEXT: global_store_dwordx4 v0, 
a[4:7], s[2:3] offset:16 -; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[2:3] offset:32 -; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[2:3] offset:48 +; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] +; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 +; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 +; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1) store <16 x float> %result, ptr addrspace(1) %ptr, align 64 @@ -4639,91 +4623,75 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_ ; SDAG-NEXT: s_movk_i32 s2, 0x41 ; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s8 -; SDAG-NEXT: v_mov_b32_e32 v1, s9 -; SDAG-NEXT: v_mov_b32_e32 v2, s10 -; SDAG-NEXT: v_mov_b32_e32 v3, s11 -; SDAG-NEXT: v_mov_b32_e32 v4, s12 -; SDAG-NEXT: v_mov_b32_e32 v5, s13 -; SDAG-NEXT: v_mov_b32_e32 v6, s14 -; SDAG-NEXT: v_mov_b32_e32 v7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s36 -; SDAG-NEXT: v_mov_b32_e32 v8, s16 -; SDAG-NEXT: v_mov_b32_e32 v9, s17 -; SDAG-NEXT: v_mov_b32_e32 v10, s18 -; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: v_mov_b32_e32 v12, s20 -; SDAG-NEXT: v_mov_b32_e32 v13, s21 -; SDAG-NEXT: v_mov_b32_e32 v14, s22 -; SDAG-NEXT: v_mov_b32_e32 v15, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s37 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s38 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s39 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s40 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s41 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s42 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s43 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s44 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s45 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s46 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s47 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s48 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s49 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s50 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s51 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: v_mov_b32_e32 v20, s12 +; SDAG-NEXT: v_mov_b32_e32 v21, s13 +; SDAG-NEXT: v_mov_b32_e32 v22, s14 +; SDAG-NEXT: v_mov_b32_e32 v23, s15 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; SDAG-NEXT: v_mov_b32_e32 v24, s16 +; SDAG-NEXT: v_mov_b32_e32 v25, s17 +; SDAG-NEXT: v_mov_b32_e32 v26, s18 +; SDAG-NEXT: v_mov_b32_e32 v27, s19 +; SDAG-NEXT: v_mov_b32_e32 v28, s20 +; SDAG-NEXT: v_mov_b32_e32 v29, s21 +; SDAG-NEXT: v_mov_b32_e32 v30, s22 +; SDAG-NEXT: v_mov_b32_e32 v31, s23 +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[50:51] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], s2, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], s2, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: 
global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 -; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b32_e32 v16, 0x41 +; GISEL-NEXT: v_mov_b32_e32 v32, 0x41 ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s36 -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s37 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s38 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s39 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s40 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s41 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s42 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s43 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s44 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s45 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s46 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s47 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s48 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s49 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s50 -; GISEL-NEXT: v_accvgpr_write_b32 a15, s51 +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] +; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] ; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, -2 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2 +; GISEL-NEXT: v_mov_b32_e32 v16, 0 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 7 ; GISEL-NEXT: s_nop 2 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] -; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 -; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 -; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GISEL-NEXT: global_store_dwordx4 v16, 
v[8:11], s[0:1] offset:32 +; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 3, i32 65, i32 1, i32 -2) store <16 x float> %result, ptr addrspace(1) %ptr, align 64 @@ -5031,77 +4999,72 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: v_mov_b32_e32 v4, s16 -; SDAG-NEXT: v_mov_b32_e32 v5, s17 -; SDAG-NEXT: v_mov_b32_e32 v6, s18 -; SDAG-NEXT: v_mov_b32_e32 v7, s19 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 -; SDAG-NEXT: v_mov_b32_e32 v9, s21 -; SDAG-NEXT: v_mov_b32_e32 v10, s22 -; SDAG-NEXT: v_mov_b32_e32 v11, s23 +; SDAG-NEXT: v_mov_b32_e32 v32, s12 +; SDAG-NEXT: v_mov_b32_e32 v33, s13 +; SDAG-NEXT: v_mov_b32_e32 v34, s14 +; SDAG-NEXT: v_mov_b32_e32 v35, s15 +; SDAG-NEXT: v_mov_b32_e32 v36, s16 +; SDAG-NEXT: v_mov_b32_e32 v37, s17 +; SDAG-NEXT: v_mov_b32_e32 v38, s18 +; SDAG-NEXT: v_mov_b32_e32 v39, s19 +; SDAG-NEXT: v_mov_b32_e32 v40, s20 +; SDAG-NEXT: v_mov_b32_e32 v41, s21 +; SDAG-NEXT: v_mov_b32_e32 v42, s22 +; SDAG-NEXT: v_mov_b32_e32 v43, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v13, s25 -; SDAG-NEXT: v_mov_b32_e32 v14, s26 -; SDAG-NEXT: v_mov_b32_e32 v15, s27 +; SDAG-NEXT: v_mov_b32_e32 v44, s24 +; SDAG-NEXT: v_mov_b32_e32 v45, s25 +; SDAG-NEXT: v_mov_b32_e32 v46, s26 +; SDAG-NEXT: v_mov_b32_e32 v47, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a31, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a30, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a29, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a28, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a27, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a26, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a25, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a24, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a23, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a22, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a21, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a20, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a19, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a18, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a17, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a16, s8 +; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[16:31] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v2, s20 -; SDAG-NEXT: v_mov_b32_e32 v3, s21 -; SDAG-NEXT: v_mov_b32_e32 v4, s22 -; SDAG-NEXT: v_mov_b32_e32 v5, s23 -; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 -; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1 +; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2 +; SDAG-NEXT: s_nop 7 +; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48 
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v6, s18 -; SDAG-NEXT: v_mov_b32_e32 v7, s19 -; SDAG-NEXT: v_mov_b32_e32 v4, s16 -; SDAG-NEXT: v_mov_b32_e32 v5, s17 -; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 -; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32 +; SDAG-NEXT: v_mov_b64_e32 v[24:25], 16 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 -; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v10, s10 -; SDAG-NEXT: v_mov_b32_e32 v11, s11 -; SDAG-NEXT: v_mov_b32_e32 v8, s8 -; SDAG-NEXT: v_mov_b32_e32 v9, s9 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 -; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -5109,61 +5072,45 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16 -; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; GISEL-NEXT: v_accvgpr_write_b32 a31, s23 -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] -; GISEL-NEXT: v_accvgpr_write_b32 a30, s22 -; GISEL-NEXT: v_accvgpr_write_b32 a29, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a28, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a27, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a26, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a25, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a24, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a23, s15 -; 
GISEL-NEXT: v_accvgpr_write_b32 a22, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a21, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a20, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a19, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a18, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a17, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a16, s8 -; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48 -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[16:31] blgp:2 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2 +; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 +; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 +; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 +; GISEL-NEXT: s_nop 7 +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0) @@ -5177,77 +5124,70 @@ define amdgpu_kernel void 
@test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; SDAG: ; %bb.0: ; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v0, s12 -; SDAG-NEXT: v_mov_b32_e32 v1, s13 -; SDAG-NEXT: v_mov_b32_e32 v2, s14 -; SDAG-NEXT: v_mov_b32_e32 v3, s15 -; SDAG-NEXT: v_mov_b32_e32 v4, s16 -; SDAG-NEXT: v_mov_b32_e32 v5, s17 -; SDAG-NEXT: v_mov_b32_e32 v6, s18 -; SDAG-NEXT: v_mov_b32_e32 v7, s19 -; SDAG-NEXT: v_mov_b32_e32 v8, s20 -; SDAG-NEXT: v_mov_b32_e32 v9, s21 -; SDAG-NEXT: v_mov_b32_e32 v10, s22 -; SDAG-NEXT: v_mov_b32_e32 v11, s23 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: v_mov_b32_e32 v20, s16 +; SDAG-NEXT: v_mov_b32_e32 v21, s17 +; SDAG-NEXT: v_mov_b32_e32 v22, s18 +; SDAG-NEXT: v_mov_b32_e32 v23, s19 +; SDAG-NEXT: v_mov_b32_e32 v24, s20 +; SDAG-NEXT: v_mov_b32_e32 v25, s21 +; SDAG-NEXT: v_mov_b32_e32 v26, s22 +; SDAG-NEXT: v_mov_b32_e32 v27, s23 ; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v13, s25 -; SDAG-NEXT: v_mov_b32_e32 v14, s26 -; SDAG-NEXT: v_mov_b32_e32 v15, s27 +; SDAG-NEXT: v_mov_b32_e32 v28, s24 +; SDAG-NEXT: v_mov_b32_e32 v29, s25 +; SDAG-NEXT: v_mov_b32_e32 v30, s26 +; SDAG-NEXT: v_mov_b32_e32 v31, s27 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, s8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s11 -; SDAG-NEXT: v_accvgpr_write_b32 a4, s12 -; SDAG-NEXT: v_accvgpr_write_b32 a5, s13 -; SDAG-NEXT: v_accvgpr_write_b32 a6, s14 -; SDAG-NEXT: v_accvgpr_write_b32 a7, s15 -; SDAG-NEXT: v_accvgpr_write_b32 a8, s16 -; SDAG-NEXT: v_accvgpr_write_b32 a9, s17 -; SDAG-NEXT: v_accvgpr_write_b32 a10, s18 -; SDAG-NEXT: v_accvgpr_write_b32 a11, s19 -; SDAG-NEXT: v_accvgpr_write_b32 a12, s20 -; SDAG-NEXT: v_accvgpr_write_b32 a13, s21 -; SDAG-NEXT: v_accvgpr_write_b32 a14, s22 -; SDAG-NEXT: v_accvgpr_write_b32 a15, s23 +; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23] ; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 -; SDAG-NEXT: v_mov_b32_e32 v2, s20 -; SDAG-NEXT: v_mov_b32_e32 v3, s21 -; SDAG-NEXT: v_mov_b32_e32 v4, s22 -; SDAG-NEXT: v_mov_b32_e32 v5, s23 -; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48 -; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1 +; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 +; SDAG-NEXT: v_mov_b32_e32 v16, s20 +; SDAG-NEXT: v_mov_b32_e32 v17, s21 +; SDAG-NEXT: v_mov_b32_e32 v18, s22 +; SDAG-NEXT: v_mov_b32_e32 v19, s23 +; SDAG-NEXT: v_mov_b64_e32 v[20:21], 48 +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v6, s18 -; SDAG-NEXT: v_mov_b32_e32 v7, s19 -; SDAG-NEXT: v_mov_b32_e32 v4, s16 -; SDAG-NEXT: v_mov_b32_e32 v5, s17 -; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32 -; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[22:23], 32 +; SDAG-NEXT: 
v_mov_b64_e32 v[24:25], 16 +; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s17 +; SDAG-NEXT: v_mov_b32_e32 v18, s18 +; SDAG-NEXT: v_mov_b32_e32 v19, s19 +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v8, s14 -; SDAG-NEXT: v_mov_b32_e32 v9, s15 -; SDAG-NEXT: v_mov_b32_e32 v6, s12 -; SDAG-NEXT: v_mov_b32_e32 v7, s13 -; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16 -; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1 +; SDAG-NEXT: v_mov_b64_e32 v[26:27], 0 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v10, s10 -; SDAG-NEXT: v_mov_b32_e32 v11, s11 -; SDAG-NEXT: v_mov_b32_e32 v8, s8 -; SDAG-NEXT: v_mov_b32_e32 v9, s9 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0 -; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1 +; SDAG-NEXT: s_nop 0 +; SDAG-NEXT: v_mov_b32_e32 v16, s8 +; SDAG-NEXT: v_mov_b32_e32 v17, s9 +; SDAG-NEXT: v_mov_b32_e32 v18, s10 +; SDAG-NEXT: v_mov_b32_e32 v19, s11 +; SDAG-NEXT: global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1 +; SDAG-NEXT: global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_endpgm ; @@ -5255,61 +5195,53 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non ; GISEL: ; %bb.0: ; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 ; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 -; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0 -; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16 -; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32 +; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0 +; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16 +; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s8 -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s11 -; GISEL-NEXT: v_accvgpr_write_b32 a4, s12 -; GISEL-NEXT: v_accvgpr_write_b32 a5, s13 -; GISEL-NEXT: v_accvgpr_write_b32 a6, s14 -; GISEL-NEXT: v_accvgpr_write_b32 a7, s15 -; GISEL-NEXT: v_accvgpr_write_b32 a8, s16 -; GISEL-NEXT: v_accvgpr_write_b32 a9, s17 -; GISEL-NEXT: v_accvgpr_write_b32 a10, s18 -; GISEL-NEXT: v_accvgpr_write_b32 a11, s19 -; GISEL-NEXT: v_accvgpr_write_b32 a12, s20 -; GISEL-NEXT: v_accvgpr_write_b32 a13, s21 -; GISEL-NEXT: v_accvgpr_write_b32 a14, s22 -; GISEL-NEXT: 
v_accvgpr_write_b32 a15, s23 -; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48 -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[40:41] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[42:43] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[44:45] ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[46:47] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[48:49] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[50:51] ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17] ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23] -; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48 +; GISEL-NEXT: s_nop 0 +; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2 +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_nop 3 -; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1 +; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42) @@ -6298,6 +6230,6 @@ declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32>, <4 x i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x 
i32>, <16 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #2 -attributes #0 = { "amdgpu-flat-work-group-size"="512,512" } +attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" } attributes #1 = { "amdgpu-flat-work-group-size"="128,128" } attributes #2 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll index ea9334a..31a48de 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-SDAG %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX942-STRESS,GFX942-SDAG-STRESS %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX942-STRESS,GFX942-GISEL-STRESS %s declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float>, <2 x float>, <4 x float>, i32, i32, i32) declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float>, <2 x float>, <16 x float>, i32, i32, i32) @@ -51,50 +49,6 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(ptr addrspace(1) %arg) #0 { ; GFX942-GISEL-NEXT: s_nop 5 ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm -; -; GFX942-SDAG-STRESS-LABEL: test_mfma_f32_16x16x8xf32: -; GFX942-SDAG-STRESS: ; %bb.0: ; %bb -; GFX942-SDAG-STRESS-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v2, 0x40400000 -; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v3, 4.0 -; GFX942-SDAG-STRESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-STRESS-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-SDAG-STRESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-SDAG-STRESS-NEXT: s_nop 1 -; GFX942-SDAG-STRESS-NEXT: v_mfma_f32_16x16x8_xf32 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-SDAG-STRESS-NEXT: s_nop 6 -; GFX942-SDAG-STRESS-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] -; GFX942-SDAG-STRESS-NEXT: s_endpgm -; -; GFX942-GISEL-STRESS-LABEL: test_mfma_f32_16x16x8xf32: -; GFX942-GISEL-STRESS: ; %bb.0: ; %bb -; GFX942-GISEL-STRESS-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-GISEL-STRESS-NEXT: s_mov_b32 s0, 1.0 -; GFX942-GISEL-STRESS-NEXT: s_mov_b32 s2, 0x40400000 -; GFX942-GISEL-STRESS-NEXT: s_mov_b32 s1, 2.0 -; GFX942-GISEL-STRESS-NEXT: s_mov_b32 s3, 4.0 -; GFX942-GISEL-STRESS-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-GISEL-STRESS-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX942-GISEL-STRESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-STRESS-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-GISEL-STRESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 
a1, s1 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-GISEL-STRESS-NEXT: s_nop 1 -; GFX942-GISEL-STRESS-NEXT: v_mfma_f32_16x16x8_xf32 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-GISEL-STRESS-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-GISEL-STRESS-NEXT: s_nop 5 -; GFX942-GISEL-STRESS-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] -; GFX942-GISEL-STRESS-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <4 x float> %in.1, i32 1, i32 2, i32 3) @@ -178,82 +132,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 { ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-GISEL-NEXT: s_endpgm -; -; GFX942-SDAG-STRESS-LABEL: test_mfma_f32_32x32x4xf32: -; GFX942-SDAG-STRESS: ; %bb.0: ; %bb -; GFX942-SDAG-STRESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v2, 0x40400000 -; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v3, 4.0 -; GFX942-SDAG-STRESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-STRESS-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX942-SDAG-STRESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX942-SDAG-STRESS-NEXT: v_accvgpr_write_b32 a15, s15 -; GFX942-SDAG-STRESS-NEXT: s_nop 1 -; GFX942-SDAG-STRESS-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-SDAG-STRESS-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-SDAG-STRESS-NEXT: s_nop 7 -; GFX942-SDAG-STRESS-NEXT: s_nop 1 -; GFX942-SDAG-STRESS-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GFX942-SDAG-STRESS-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX942-SDAG-STRESS-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX942-SDAG-STRESS-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] -; GFX942-SDAG-STRESS-NEXT: s_endpgm -; -; GFX942-GISEL-STRESS-LABEL: test_mfma_f32_32x32x4xf32: -; GFX942-GISEL-STRESS: ; %bb.0: ; %bb -; GFX942-GISEL-STRESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 -; GFX942-GISEL-STRESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-STRESS-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 -; GFX942-GISEL-STRESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 
a3, s3 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a4, s4 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a5, s5 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a6, s6 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a7, s7 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a8, s8 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a9, s9 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a10, s10 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a11, s11 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a12, s12 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a13, s13 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a14, s14 -; GFX942-GISEL-STRESS-NEXT: v_accvgpr_write_b32 a15, s15 -; GFX942-GISEL-STRESS-NEXT: s_mov_b32 s0, 1.0 -; GFX942-GISEL-STRESS-NEXT: s_mov_b32 s1, 2.0 -; GFX942-GISEL-STRESS-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-GISEL-STRESS-NEXT: s_mov_b32 s0, 0x40400000 -; GFX942-GISEL-STRESS-NEXT: s_mov_b32 s1, 4.0 -; GFX942-GISEL-STRESS-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX942-GISEL-STRESS-NEXT: s_nop 1 -; GFX942-GISEL-STRESS-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 -; GFX942-GISEL-STRESS-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-GISEL-STRESS-NEXT: s_nop 7 -; GFX942-GISEL-STRESS-NEXT: s_nop 1 -; GFX942-GISEL-STRESS-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] -; GFX942-GISEL-STRESS-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 -; GFX942-GISEL-STRESS-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 -; GFX942-GISEL-STRESS-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 -; GFX942-GISEL-STRESS-NEXT: s_endpgm bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <16 x float> %in.1, i32 1, i32 2, i32 3) @@ -264,4 +142,3 @@ bb: attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; GFX942: {{.*}} -; GFX942-STRESS: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll index 8056881..b25fe83 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll @@ -17,24 +17,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) % ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; SDAG-NEXT: v_mov_b32_e32 v12, 0 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7] +; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 -; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[2:3] +; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[0:1] ; SDAG-NEXT: s_waitcnt lgkmcnt(0) ; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_mov_b32_e32 v13, s16 +; SDAG-NEXT: v_mov_b32_e32 v17, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__vgpr: @@ -547,24 +547,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GCN-NEXT: v_mov_b32_e32 v12, 0 +; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7] +; GCN-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 ; GCN-NEXT: s_load_dword s16, s[4:5], 0x64 -; GCN-NEXT: v_mov_b64_e32 v[10:11], s[2:3] -; GCN-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GCN-NEXT: v_mov_b64_e32 v[14:15], s[2:3] +; GCN-NEXT: v_mov_b64_e32 v[12:13], s[0:1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13] ; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GCN-NEXT: v_mov_b32_e32 v13, s16 +; GCN-NEXT: v_mov_b32_e32 v17, s16 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[14:17], v[8:11], v[0:7], v13 cbsz:1 abid:2 +; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: global_store_dwordx4 v12, v[14:17], s[6:7] +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; GCN-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -855,30 +855,30 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) % ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] +; SDAG-NEXT: 
global_load_dwordx4 v[8:11], v0, s[6:7] ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 -; SDAG-NEXT: v_mov_b32_e32 v14, s8 -; SDAG-NEXT: v_mov_b32_e32 v15, s9 -; SDAG-NEXT: v_mov_b32_e32 v16, s10 -; SDAG-NEXT: v_mov_b32_e32 v17, s11 -; SDAG-NEXT: v_mov_b32_e32 v2, s12 -; SDAG-NEXT: v_mov_b32_e32 v3, s13 -; SDAG-NEXT: v_mov_b32_e32 v4, s14 -; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v12, s8 +; SDAG-NEXT: v_mov_b32_e32 v13, s9 +; SDAG-NEXT: v_mov_b32_e32 v14, s10 +; SDAG-NEXT: v_mov_b32_e32 v15, s11 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v6, s0 -; SDAG-NEXT: v_mov_b32_e32 v7, s1 -; SDAG-NEXT: v_mov_b32_e32 v8, s2 -; SDAG-NEXT: v_mov_b32_e32 v9, s3 -; SDAG-NEXT: v_mov_b32_e32 v1, s16 +; SDAG-NEXT: v_mov_b32_e32 v4, s0 +; SDAG-NEXT: v_mov_b32_e32 v5, s1 +; SDAG-NEXT: v_mov_b32_e32 v6, s2 +; SDAG-NEXT: v_mov_b32_e32 v7, s3 +; SDAG-NEXT: v_mov_b32_e32 v17, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__vgpr: @@ -1032,22 +1032,22 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v26, s8 -; SDAG-NEXT: v_mov_b32_e32 v27, s9 -; SDAG-NEXT: v_mov_b32_e32 v28, s10 -; SDAG-NEXT: v_mov_b32_e32 v29, s11 -; SDAG-NEXT: v_mov_b32_e32 v18, s12 -; SDAG-NEXT: v_mov_b32_e32 v19, s13 -; SDAG-NEXT: v_mov_b32_e32 v20, s14 -; SDAG-NEXT: v_mov_b32_e32 v21, s15 -; SDAG-NEXT: v_mov_b32_e32 v22, s0 -; SDAG-NEXT: v_mov_b32_e32 v23, s1 -; SDAG-NEXT: v_mov_b32_e32 v24, s2 -; SDAG-NEXT: v_mov_b32_e32 v25, s3 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v24, s8 +; SDAG-NEXT: v_mov_b32_e32 v25, s9 +; SDAG-NEXT: v_mov_b32_e32 v26, s10 +; SDAG-NEXT: v_mov_b32_e32 v27, s11 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: v_mov_b32_e32 v20, s0 +; SDAG-NEXT: v_mov_b32_e32 v21, s1 +; SDAG-NEXT: v_mov_b32_e32 v22, s2 +; SDAG-NEXT: v_mov_b32_e32 v23, s3 +; SDAG-NEXT: v_mov_b32_e32 v28, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 @@ -1397,30 +1397,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] +; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 
0x54 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 -; SDAG-NEXT: v_mov_b32_e32 v14, s8 -; SDAG-NEXT: v_mov_b32_e32 v15, s9 -; SDAG-NEXT: v_mov_b32_e32 v16, s10 -; SDAG-NEXT: v_mov_b32_e32 v17, s11 -; SDAG-NEXT: v_mov_b32_e32 v2, s12 -; SDAG-NEXT: v_mov_b32_e32 v3, s13 -; SDAG-NEXT: v_mov_b32_e32 v4, s14 -; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v12, s8 +; SDAG-NEXT: v_mov_b32_e32 v13, s9 +; SDAG-NEXT: v_mov_b32_e32 v14, s10 +; SDAG-NEXT: v_mov_b32_e32 v15, s11 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v6, s0 -; SDAG-NEXT: v_mov_b32_e32 v7, s1 -; SDAG-NEXT: v_mov_b32_e32 v8, s2 -; SDAG-NEXT: v_mov_b32_e32 v9, s3 -; SDAG-NEXT: v_mov_b32_e32 v1, s16 +; SDAG-NEXT: v_mov_b32_e32 v4, s0 +; SDAG-NEXT: v_mov_b32_e32 v5, s1 +; SDAG-NEXT: v_mov_b32_e32 v6, s2 +; SDAG-NEXT: v_mov_b32_e32 v7, s3 +; SDAG-NEXT: v_mov_b32_e32 v17, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr: @@ -1566,30 +1566,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] +; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 -; SDAG-NEXT: v_mov_b32_e32 v14, s8 -; SDAG-NEXT: v_mov_b32_e32 v15, s9 -; SDAG-NEXT: v_mov_b32_e32 v16, s10 -; SDAG-NEXT: v_mov_b32_e32 v17, s11 -; SDAG-NEXT: v_mov_b32_e32 v2, s12 -; SDAG-NEXT: v_mov_b32_e32 v3, s13 -; SDAG-NEXT: v_mov_b32_e32 v4, s14 -; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v12, s8 +; SDAG-NEXT: v_mov_b32_e32 v13, s9 +; SDAG-NEXT: v_mov_b32_e32 v14, s10 +; SDAG-NEXT: v_mov_b32_e32 v15, s11 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v6, s0 -; SDAG-NEXT: v_mov_b32_e32 v7, s1 -; SDAG-NEXT: v_mov_b32_e32 v8, s2 -; SDAG-NEXT: v_mov_b32_e32 v9, s3 -; SDAG-NEXT: v_mov_b32_e32 v1, s16 +; SDAG-NEXT: v_mov_b32_e32 v4, s0 +; SDAG-NEXT: v_mov_b32_e32 v5, s1 +; SDAG-NEXT: v_mov_b32_e32 v6, s2 +; SDAG-NEXT: v_mov_b32_e32 v7, s3 +; SDAG-NEXT: v_mov_b32_e32 v17, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr: @@ -1735,30 +1735,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; 
SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] +; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 -; SDAG-NEXT: v_mov_b32_e32 v14, s8 -; SDAG-NEXT: v_mov_b32_e32 v15, s9 -; SDAG-NEXT: v_mov_b32_e32 v16, s10 -; SDAG-NEXT: v_mov_b32_e32 v17, s11 -; SDAG-NEXT: v_mov_b32_e32 v2, s12 -; SDAG-NEXT: v_mov_b32_e32 v3, s13 -; SDAG-NEXT: v_mov_b32_e32 v4, s14 -; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v12, s8 +; SDAG-NEXT: v_mov_b32_e32 v13, s9 +; SDAG-NEXT: v_mov_b32_e32 v14, s10 +; SDAG-NEXT: v_mov_b32_e32 v15, s11 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v6, s0 -; SDAG-NEXT: v_mov_b32_e32 v7, s1 -; SDAG-NEXT: v_mov_b32_e32 v8, s2 -; SDAG-NEXT: v_mov_b32_e32 v9, s3 -; SDAG-NEXT: v_mov_b32_e32 v1, s16 +; SDAG-NEXT: v_mov_b32_e32 v4, s0 +; SDAG-NEXT: v_mov_b32_e32 v5, s1 +; SDAG-NEXT: v_mov_b32_e32 v6, s2 +; SDAG-NEXT: v_mov_b32_e32 v7, s3 +; SDAG-NEXT: v_mov_b32_e32 v17, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr: @@ -1904,30 +1904,30 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[10:13], v0, s[6:7] +; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: v_mov_b32_e32 v0, 0 -; SDAG-NEXT: v_mov_b32_e32 v14, s8 -; SDAG-NEXT: v_mov_b32_e32 v15, s9 -; SDAG-NEXT: v_mov_b32_e32 v16, s10 -; SDAG-NEXT: v_mov_b32_e32 v17, s11 -; SDAG-NEXT: v_mov_b32_e32 v2, s12 -; SDAG-NEXT: v_mov_b32_e32 v3, s13 -; SDAG-NEXT: v_mov_b32_e32 v4, s14 -; SDAG-NEXT: v_mov_b32_e32 v5, s15 +; SDAG-NEXT: v_mov_b32_e32 v12, s8 +; SDAG-NEXT: v_mov_b32_e32 v13, s9 +; SDAG-NEXT: v_mov_b32_e32 v14, s10 +; SDAG-NEXT: v_mov_b32_e32 v15, s11 +; SDAG-NEXT: v_mov_b32_e32 v0, s12 +; SDAG-NEXT: v_mov_b32_e32 v1, s13 +; SDAG-NEXT: v_mov_b32_e32 v2, s14 +; SDAG-NEXT: v_mov_b32_e32 v3, s15 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v6, s0 -; SDAG-NEXT: v_mov_b32_e32 v7, s1 -; SDAG-NEXT: v_mov_b32_e32 v8, s2 -; SDAG-NEXT: v_mov_b32_e32 v9, s3 -; SDAG-NEXT: v_mov_b32_e32 v1, s16 +; SDAG-NEXT: v_mov_b32_e32 v4, s0 +; SDAG-NEXT: v_mov_b32_e32 v5, s1 +; SDAG-NEXT: v_mov_b32_e32 v6, s2 +; SDAG-NEXT: v_mov_b32_e32 v7, s3 +; SDAG-NEXT: v_mov_b32_e32 v17, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[10:13], v[14:17], v[2:9], v1 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 ; 
SDAG-NEXT: s_nop 7 -; SDAG-NEXT: global_store_dwordx4 v0, v[10:13], s[6:7] +; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr: @@ -2081,22 +2081,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace( ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v26, s8 -; SDAG-NEXT: v_mov_b32_e32 v27, s9 -; SDAG-NEXT: v_mov_b32_e32 v28, s10 -; SDAG-NEXT: v_mov_b32_e32 v29, s11 -; SDAG-NEXT: v_mov_b32_e32 v18, s12 -; SDAG-NEXT: v_mov_b32_e32 v19, s13 -; SDAG-NEXT: v_mov_b32_e32 v20, s14 -; SDAG-NEXT: v_mov_b32_e32 v21, s15 -; SDAG-NEXT: v_mov_b32_e32 v22, s0 -; SDAG-NEXT: v_mov_b32_e32 v23, s1 -; SDAG-NEXT: v_mov_b32_e32 v24, s2 -; SDAG-NEXT: v_mov_b32_e32 v25, s3 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v24, s8 +; SDAG-NEXT: v_mov_b32_e32 v25, s9 +; SDAG-NEXT: v_mov_b32_e32 v26, s10 +; SDAG-NEXT: v_mov_b32_e32 v27, s11 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: v_mov_b32_e32 v20, s0 +; SDAG-NEXT: v_mov_b32_e32 v21, s1 +; SDAG-NEXT: v_mov_b32_e32 v22, s2 +; SDAG-NEXT: v_mov_b32_e32 v23, s3 +; SDAG-NEXT: v_mov_b32_e32 v28, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 @@ -2454,22 +2454,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace( ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v26, s8 -; SDAG-NEXT: v_mov_b32_e32 v27, s9 -; SDAG-NEXT: v_mov_b32_e32 v28, s10 -; SDAG-NEXT: v_mov_b32_e32 v29, s11 -; SDAG-NEXT: v_mov_b32_e32 v18, s12 -; SDAG-NEXT: v_mov_b32_e32 v19, s13 -; SDAG-NEXT: v_mov_b32_e32 v20, s14 -; SDAG-NEXT: v_mov_b32_e32 v21, s15 -; SDAG-NEXT: v_mov_b32_e32 v22, s0 -; SDAG-NEXT: v_mov_b32_e32 v23, s1 -; SDAG-NEXT: v_mov_b32_e32 v24, s2 -; SDAG-NEXT: v_mov_b32_e32 v25, s3 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v24, s8 +; SDAG-NEXT: v_mov_b32_e32 v25, s9 +; SDAG-NEXT: v_mov_b32_e32 v26, s10 +; SDAG-NEXT: v_mov_b32_e32 v27, s11 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: v_mov_b32_e32 v20, s0 +; SDAG-NEXT: v_mov_b32_e32 v21, s1 +; SDAG-NEXT: v_mov_b32_e32 v22, s2 +; SDAG-NEXT: v_mov_b32_e32 v23, s3 +; SDAG-NEXT: v_mov_b32_e32 v28, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 @@ -2827,22 +2827,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace( ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v26, s8 -; SDAG-NEXT: v_mov_b32_e32 v27, s9 -; SDAG-NEXT: v_mov_b32_e32 v28, s10 -; SDAG-NEXT: 
v_mov_b32_e32 v29, s11 -; SDAG-NEXT: v_mov_b32_e32 v18, s12 -; SDAG-NEXT: v_mov_b32_e32 v19, s13 -; SDAG-NEXT: v_mov_b32_e32 v20, s14 -; SDAG-NEXT: v_mov_b32_e32 v21, s15 -; SDAG-NEXT: v_mov_b32_e32 v22, s0 -; SDAG-NEXT: v_mov_b32_e32 v23, s1 -; SDAG-NEXT: v_mov_b32_e32 v24, s2 -; SDAG-NEXT: v_mov_b32_e32 v25, s3 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v24, s8 +; SDAG-NEXT: v_mov_b32_e32 v25, s9 +; SDAG-NEXT: v_mov_b32_e32 v26, s10 +; SDAG-NEXT: v_mov_b32_e32 v27, s11 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: v_mov_b32_e32 v20, s0 +; SDAG-NEXT: v_mov_b32_e32 v21, s1 +; SDAG-NEXT: v_mov_b32_e32 v22, s2 +; SDAG-NEXT: v_mov_b32_e32 v23, s3 +; SDAG-NEXT: v_mov_b32_e32 v28, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 @@ -3200,22 +3200,22 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace( ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v26, s8 -; SDAG-NEXT: v_mov_b32_e32 v27, s9 -; SDAG-NEXT: v_mov_b32_e32 v28, s10 -; SDAG-NEXT: v_mov_b32_e32 v29, s11 -; SDAG-NEXT: v_mov_b32_e32 v18, s12 -; SDAG-NEXT: v_mov_b32_e32 v19, s13 -; SDAG-NEXT: v_mov_b32_e32 v20, s14 -; SDAG-NEXT: v_mov_b32_e32 v21, s15 -; SDAG-NEXT: v_mov_b32_e32 v22, s0 -; SDAG-NEXT: v_mov_b32_e32 v23, s1 -; SDAG-NEXT: v_mov_b32_e32 v24, s2 -; SDAG-NEXT: v_mov_b32_e32 v25, s3 -; SDAG-NEXT: v_mov_b32_e32 v16, s16 +; SDAG-NEXT: v_mov_b32_e32 v24, s8 +; SDAG-NEXT: v_mov_b32_e32 v25, s9 +; SDAG-NEXT: v_mov_b32_e32 v26, s10 +; SDAG-NEXT: v_mov_b32_e32 v27, s11 +; SDAG-NEXT: v_mov_b32_e32 v16, s12 +; SDAG-NEXT: v_mov_b32_e32 v17, s13 +; SDAG-NEXT: v_mov_b32_e32 v18, s14 +; SDAG-NEXT: v_mov_b32_e32 v19, s15 +; SDAG-NEXT: v_mov_b32_e32 v20, s0 +; SDAG-NEXT: v_mov_b32_e32 v21, s1 +; SDAG-NEXT: v_mov_b32_e32 v22, s2 +; SDAG-NEXT: v_mov_b32_e32 v23, s3 +; SDAG-NEXT: v_mov_b32_e32 v28, s16 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2 +; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 ; SDAG-NEXT: v_mov_b32_e32 v16, 0 ; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: s_nop 2 @@ -3552,4 +3552,4 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg ret <16 x float> %result } -attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } +attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-agpr-alloc"="0,0" } diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll index 84123e6..393581f 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll @@ -141,7 +141,6 @@ define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_fma_mixlo_bf16 v3, v0, v1, v2 op_sel_hi:[1,1,1] ; GFX1250-NEXT: v_fma_mixhi_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_store_b16 v[0:1], v3, off scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; 
GFX1250-NEXT: s_set_pc_i64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index bc25084..5e5e3bf 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -415,11 +415,6 @@ define amdgpu_kernel void @local_volatile_store_0( ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; @@ -432,11 +427,6 @@ define amdgpu_kernel void @local_volatile_store_0( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(3) %out) { @@ -562,11 +552,6 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX12-WGP-NEXT: v_lshl_add_u32 v0, v0, s1, s2 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 -; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 -; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 -; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 -; GFX12-WGP-NEXT: s_wait_storecnt 0x0 ; GFX12-WGP-NEXT: ds_store_b32 v0, v1 ; GFX12-WGP-NEXT: s_endpgm ; @@ -583,11 +568,6 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX12-CU-NEXT: v_lshl_add_u32 v0, v0, s1, s2 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX12-CU-NEXT: s_wait_loadcnt 0x0 -; GFX12-CU-NEXT: s_wait_samplecnt 0x0 -; GFX12-CU-NEXT: s_wait_bvhcnt 0x0 -; GFX12-CU-NEXT: s_wait_kmcnt 0x0 -; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(3) %out) { diff --git a/llvm/test/CodeGen/AMDGPU/ssubo.ll b/llvm/test/CodeGen/AMDGPU/ssubo.ll index 053038d..382d892 100644 --- a/llvm/test/CodeGen/AMDGPU/ssubo.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubo.ll @@ -1,14 +1,116 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s - +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti | FileCheck %s --check-prefix=SI +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=VI +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s --check-prefix=GFX9 +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s --check-prefix=GFX10 +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s --check-prefix=GFX11 declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone 
declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone declare { <2 x i32>, <2 x i1> } @llvm.ssub.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -; FUNC-LABEL: {{^}}ssubo_i64_zext: define amdgpu_kernel void @ssubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) nounwind { +; SI-LABEL: ssubo_i64_zext: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: s_sub_u32 s10, s2, s8 +; SI-NEXT: s_subb_u32 s11, s3, s9 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1] +; SI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[8:9], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: ssubo_i64_zext: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: s_sub_u32 s6, s2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: s_subb_u32 s7, s3, s5 +; VI-NEXT: v_cmp_gt_i64_e64 s[8:9], s[4:5], 0 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[1:2] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: ssubo_i64_zext: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_sub_u32 s4, s2, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_subb_u32 s5, s3, s7 +; GFX9-NEXT: v_cmp_gt_i64_e64 s[8:9], s[6:7], 0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: ssubo_i64_zext: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_sub_u32 s4, s2, s6 +; GFX10-NEXT: s_subb_u32 s5, s3, s7 +; GFX10-NEXT: v_cmp_gt_i64_e64 s6, s[6:7], 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3] +; GFX10-NEXT: s_xor_b32 s2, s6, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX10-NEXT: v_add_co_u32 v0, s2, s4, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s5, 0, s2 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: ssubo_i64_zext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_sub_u32 s6, s2, s4 +; GFX11-NEXT: s_subb_u32 s7, s3, s5 +; GFX11-NEXT: v_cmp_gt_i64_e64 s4, s[4:5], 0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], s[2:3] +; GFX11-NEXT: s_xor_b32 s2, s4, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-NEXT: v_add_co_u32 v0, s2, s6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind %val = extractvalue { i64, i1 } %ssub, 0 %carry = extractvalue { i64, i1 } %ssub, 1 @@ -18,8 +120,102 @@ define amdgpu_kernel void @ssubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ret void } -; FUNC-LABEL: {{^}}s_ssubo_i32: define amdgpu_kernel void @s_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) nounwind { +; SI-LABEL: s_ssubo_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_sub_i32 s12, s8, s9 +; SI-NEXT: s_cmp_gt_i32 s9, 0 +; SI-NEXT: s_cselect_b64 s[10:11], -1, 0 +; SI-NEXT: s_cmp_lt_i32 s12, s8 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_cselect_b64 s[8:9], -1, 0 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_xor_b64 s[4:5], s[10:11], s[8:9] +; SI-NEXT: s_mov_b32 s0, s2 +; SI-NEXT: s_mov_b32 s1, s3 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_ssubo_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_sub_i32 s6, s4, s5 +; VI-NEXT: s_cmp_gt_i32 s5, 0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_lt_i32 s6, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; VI-NEXT: flat_store_dword v[0:1], v4 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-NEXT: flat_store_byte v[2:3], v0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: s_ssubo_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_sub_i32 s4, s6, s7 +; GFX9-NEXT: v_sub_i32 v1, s6, v1 clamp +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-NEXT: global_store_byte v0, v1, s[2:3] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_ssubo_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_sub_nc_i32 v0, s6, s7 clamp +; GFX10-NEXT: s_sub_i32 s4, s6, s7 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_cmp_ne_u32_e32 
vcc_lo, s4, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-NEXT: global_store_dword v1, v2, s[0:1] +; GFX10-NEXT: global_store_byte v1, v0, s[2:3] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_ssubo_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_sub_nc_i32 v0, s6, s7 clamp +; GFX11-NEXT: s_sub_i32 s4, s6, s7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s4, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v1, v2, s[0:1] +; GFX11-NEXT: global_store_b8 v1, v0, s[2:3] +; GFX11-NEXT: s_endpgm %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind %val = extractvalue { i32, i1 } %ssub, 0 %carry = extractvalue { i32, i1 } %ssub, 1 @@ -28,8 +224,112 @@ define amdgpu_kernel void @s_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ret void } -; FUNC-LABEL: {{^}}v_ssubo_i32: define amdgpu_kernel void @v_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { +; SI-LABEL: v_ssubo_i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s4 +; SI-NEXT: s_mov_b32 s13, s5 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v1 +; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v0 +; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_ssubo_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: flat_load_dword v5, v[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sub_u32_e32 v6, vcc, v4, v5 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 +; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], v6, v4 +; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: flat_store_dword v[0:1], v6 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-NEXT: flat_store_byte v[2:3], v0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_ssubo_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[12:13] +; GFX9-NEXT: global_load_dword v2, v0, s[14:15] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_i32 v3, v1, v2 clamp +; GFX9-NEXT: v_sub_u32_e32 v1, v1, v2 +; GFX9-NEXT: 
v_cmp_ne_u32_e32 vcc, v1, v3 +; GFX9-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: global_store_byte v0, v1, s[10:11] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_ssubo_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[12:13] +; GFX10-NEXT: global_load_dword v2, v0, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_sub_nc_i32 v3, v1, v2 clamp +; GFX10-NEXT: v_sub_nc_u32_e32 v1, v1, v2 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10-NEXT: global_store_dword v0, v1, s[8:9] +; GFX10-NEXT: global_store_byte v0, v2, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_ssubo_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-NEXT: global_load_b32 v2, v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_nc_i32 v3, v1, v2 clamp +; GFX11-NEXT: v_sub_nc_u32_e32 v1, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX11-NEXT: s_endpgm %a = load i32, ptr addrspace(1) %aptr, align 4 %b = load i32, ptr addrspace(1) %bptr, align 4 %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind @@ -40,10 +340,109 @@ define amdgpu_kernel void @v_ssubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ret void } -; FUNC-LABEL: {{^}}s_ssubo_i64: -; GCN: s_sub_u32 -; GCN: s_subb_u32 define amdgpu_kernel void @s_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) nounwind { +; SI-LABEL: s_ssubo_i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_sub_u32 s12, s4, s6 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_subb_u32 s13, s5, s7 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] +; SI-NEXT: v_cmp_gt_i64_e64 s[4:5], s[6:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], vcc +; SI-NEXT: s_mov_b32 s0, s2 +; SI-NEXT: s_mov_b32 s1, s3 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_ssubo_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_sub_u32 s0, s4, s6 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_subb_u32 s1, s5, s7 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5] +; VI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc +; VI-NEXT: flat_store_dwordx2 
v[0:1], v[4:5] +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-NEXT: flat_store_byte v[2:3], v0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: s_ssubo_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_sub_u32 s0, s12, s14 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_subb_u32 s1, s13, s15 +; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[14:15], 0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9-NEXT: global_store_byte v2, v0, s[10:11] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_ssubo_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_sub_u32 s0, s12, s14 +; GFX10-NEXT: s_subb_u32 s1, s13, s15 +; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[14:15], 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[12:13] +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_xor_b32 s0, s2, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX10-NEXT: global_store_byte v2, v3, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_ssubo_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_sub_u32 s8, s4, s6 +; GFX11-NEXT: s_subb_u32 s9, s5, s7 +; GFX11-NEXT: v_cmp_gt_i64_e64 s6, s[6:7], 0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5] +; GFX11-NEXT: v_mov_b32_e32 v0, s8 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9 +; GFX11-NEXT: s_xor_b32 s4, s6, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b8 v2, v3, s[2:3] +; GFX11-NEXT: s_endpgm %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind %val = extractvalue { i64, i1 } %ssub, 0 %carry = extractvalue { i64, i1 } %ssub, 1 @@ -52,16 +451,121 @@ define amdgpu_kernel void @s_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ret void } -; FUNC-LABEL: {{^}}v_ssubo_i64: -; SI: v_sub_i32_e32 v{{[0-9]+}}, vcc, -; SI: v_subb_u32_e32 v{{[0-9]+}}, vcc, - -; VI: v_sub_u32_e32 v{{[0-9]+}}, vcc, -; VI: v_subb_u32_e32 v{{[0-9]+}}, vcc, - -; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc, -; GFX9: v_subb_co_u32_e32 v{{[0-9]+}}, vcc, define amdgpu_kernel void @v_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { +; SI-LABEL: v_ssubo_i64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s4 +; SI-NEXT: s_mov_b32 s13, s5 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_sub_i32_e32 v4, vcc, v0, v2 +; SI-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc +; SI-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3] +; SI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1] +; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0 +; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_ssubo_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sub_u32_e32 v8, vcc, v0, v2 +; VI-NEXT: v_subb_u32_e32 v9, vcc, v1, v3, vcc +; VI-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3] +; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1] +; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9] +; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-NEXT: flat_store_byte v[6:7], v0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_ssubo_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[12:13] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[14:15] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9] +; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9-NEXT: global_store_byte v6, v0, s[10:11] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_ssubo_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v6, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[12:13] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1] +; GFX10-NEXT: s_xor_b32 s0, vcc_lo, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX10-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9] +; GFX10-NEXT: global_store_byte v6, v0, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_ssubo_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[4:11], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v6, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b64 v[0:1], v6, s[8:9] +; GFX11-NEXT: global_load_b64 v[2:3], v6, s[10:11] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, v1, v3, vcc_lo +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1] +; GFX11-NEXT: s_xor_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; 
GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b64 v6, v[4:5], s[4:5] +; GFX11-NEXT: global_store_b8 v6, v0, s[6:7] +; GFX11-NEXT: s_endpgm %a = load i64, ptr addrspace(1) %aptr, align 4 %b = load i64, ptr addrspace(1) %bptr, align 4 %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind @@ -72,14 +576,134 @@ define amdgpu_kernel void @v_ssubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ret void } -; FUNC-LABEL: {{^}}v_ssubo_v2i32: -; SICIVI: v_cmp_lt_i32 -; SICIVI: v_cmp_lt_i32 -; SICIVI: v_sub_{{[iu]}}32 -; SICIVI: v_cmp_lt_i32 -; SICIVI: v_cmp_lt_i32 -; SICIVI: v_sub_{{[iu]}}32 define amdgpu_kernel void @v_ssubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { +; SI-LABEL: v_ssubo_v2i32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s4 +; SI-NEXT: s_mov_b32 s13, s5 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_sub_i32_e32 v5, vcc, v1, v3 +; SI-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 +; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 0, v3 +; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], v5, v1 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v2 +; SI-NEXT: v_cmp_lt_i32_e64 s[2:3], v4, v0 +; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; SI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_ssubo_v2i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sub_u32_e32 v9, vcc, v1, v3 +; VI-NEXT: v_sub_u32_e32 v8, vcc, v0, v2 +; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], 0, v3 +; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v1 +; VI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v2 +; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v0 +; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; VI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3] +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9] +; VI-NEXT: flat_store_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: v_ssubo_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[12:13] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[14:15] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_u32_e32 v5, v1, v3 +; GFX9-NEXT: v_sub_i32 v1, v1, v3 clamp +; GFX9-NEXT: v_sub_u32_e32 v4, v0, v2 +; GFX9-NEXT: v_sub_i32 v0, 
v0, v2 clamp +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v5, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v4, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9] +; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[10:11] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_ssubo_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v5, s[12:13] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v5, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_sub_nc_u32_e32 v4, v1, v3 +; GFX10-NEXT: v_sub_nc_i32 v1, v1, v3 clamp +; GFX10-NEXT: v_sub_nc_u32_e32 v3, v0, v2 +; GFX10-NEXT: v_sub_nc_i32 v0, v0, v2 clamp +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-NEXT: global_store_dwordx2 v5, v[3:4], s[8:9] +; GFX10-NEXT: global_store_dwordx2 v5, v[0:1], s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_ssubo_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v5, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b64 v[0:1], v5, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v5, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_nc_u32_e32 v4, v1, v3 +; GFX11-NEXT: v_sub_nc_i32 v1, v1, v3 clamp +; GFX11-NEXT: v_sub_nc_u32_e32 v3, v0, v2 +; GFX11-NEXT: v_sub_nc_i32 v0, v0, v2 clamp +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b64 v5, v[3:4], s[0:1] +; GFX11-NEXT: global_store_b64 v5, v[0:1], s[2:3] +; GFX11-NEXT: s_endpgm %a = load <2 x i32>, ptr addrspace(1) %aptr, align 4 %b = load <2 x i32>, ptr addrspace(1) %bptr, align 4 %sadd = call { <2 x i32>, <2 x i1> } @llvm.ssub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll index d230ff5..e1574dc 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddo.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti | FileCheck %s --check-prefix=SI +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=VI +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s --check-prefix=GFX9 +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s --check-prefix=GFX10 +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s --check-prefix=GFX11 
define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 { ; SI-LABEL: s_uaddo_i64_zext: @@ -12,14 +14,14 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_add_u32 s0, s2, s8 ; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_addc_u32 s1, s3, s9 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -61,6 +63,40 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_uaddo_i64_zext: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_add_u32 s4, s2, s6 +; GFX10-NEXT: s_addc_u32 s5, s3, s7 +; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[4:5], s[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX10-NEXT: v_add_co_u32 v0, s2, s4, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s5, 0, s2 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_uaddo_i64_zext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_u32 s4, s2, s4 +; GFX11-NEXT: s_addc_u32 s5, s3, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_lt_u64_e64 s2, s[4:5], s[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v0, s2, s4, v0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %val = extractvalue { i64, i1 } %uadd, 0 %carry = extractvalue { i64, i1 } %uadd, 1 @@ -76,21 +112,21 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-LABEL: s_uaddo_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: v_mov_b32_e32 v0, s9 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_mov_b32 s8, s2 -; SI-NEXT: s_mov_b32 s9, s3 -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: v_add_i32_e32 v0, vcc, s12, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, s8, v0 +; SI-NEXT: s_mov_b32 s0, s2 +; SI-NEXT: s_mov_b32 s1, s3 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 ; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0 +; 
SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_uaddo_i32: @@ -121,6 +157,34 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: global_store_byte v0, v2, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_uaddo_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v1, s4, s6, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_byte v0, v2, s[2:3] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_uaddo_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v1, s4, s6, s7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX11-NEXT: s_endpgm %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) %val = extractvalue { i32, i1 } %uadd, 0 %carry = extractvalue { i32, i1 } %uadd, 1 @@ -137,17 +201,15 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s4 ; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s16, s6 -; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -193,6 +255,38 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: global_store_byte v0, v2, s[10:11] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_uaddo_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[12:13] +; GFX10-NEXT: global_load_dword v2, v0, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[8:9] +; GFX10-NEXT: global_store_byte v0, v2, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_uaddo_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-NEXT: global_load_b32 v2, v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_co_u32 v1, s4, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v0, v1, 
s[0:1] +; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr @@ -215,17 +309,15 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s4 ; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s16, s6 -; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -283,6 +375,45 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac ; GFX9-NEXT: global_store_byte v0, v2, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_uaddo_i32_novcc: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[12:13] +; GFX10-NEXT: global_load_dword v2, v0, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[8:9] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: global_store_byte v0, v2, s[10:11] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_uaddo_i32_novcc: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-NEXT: global_load_b32 v2, v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_co_u32 v1, s4, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr @@ -306,21 +437,21 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_u32 s6, s4, s6 -; SI-NEXT: s_addc_u32 s7, s5, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_addc_u32 s7, s5, s7 ; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] ; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s0, s2 +; SI-NEXT: s_mov_b32 s1, s3 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: 
v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0 -; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0 +; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0 +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_uaddo_i64: @@ -359,6 +490,37 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9] ; GFX9-NEXT: global_store_byte v4, v0, s[10:11] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_uaddo_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_add_u32 s0, s12, s14 +; GFX10-NEXT: s_addc_u32 s1, s13, s15 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[12:13] +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX10-NEXT: global_store_byte v2, v3, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_uaddo_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_add_u32 s6, s4, s6 +; GFX11-NEXT: s_addc_u32 s7, s5, s7 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b8 v2, v3, s[2:3] +; GFX11-NEXT: s_endpgm %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %val = extractvalue { i64, i1 } %uadd, 0 %carry = extractvalue { i64, i1 } %uadd, 1 @@ -375,17 +537,15 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s4 ; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s16, s6 -; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -393,8 +553,8 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, v0, v2 ; SI-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc -; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0 ; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] +; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -437,6 +597,42 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: global_store_byte v4, v0, s[10:11] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_uaddo_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; 
GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9] +; GFX10-NEXT: global_store_byte v4, v0, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_uaddo_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v4, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v1, v3, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX11-NEXT: global_store_b8 v4, v0, s[2:3] +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i64, ptr addrspace(1) %a.ptr @@ -459,17 +655,15 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s4 ; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s16, s6 -; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -477,8 +671,8 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v1, v0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 @@ -522,6 +716,42 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: global_store_short v0, v2, s[8:9] ; GFX9-NEXT: global_store_byte v0, v1, s[10:11] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_uaddo_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_ushort v1, v0, s[12:13] +; GFX10-NEXT: global_load_ushort v2, v0, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v2 +; GFX10-NEXT: v_cmp_lt_u32_sdwa s0, v2, v1 src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX10-NEXT: global_store_short v0, v2, 
s[8:9] +; GFX10-NEXT: global_store_byte v0, v1, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_uaddo_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_d16_b16 v1, v0, s[4:5] +; GFX11-NEXT: global_load_u16 v2, v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v2, v1, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; GFX11-NEXT: v_cmp_lt_u32_e32 vcc_lo, v3, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b16 v0, v2, s[0:1] +; GFX11-NEXT: global_store_b8 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr @@ -544,17 +774,15 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s4 ; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s16, s6 -; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -606,6 +834,42 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] ; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_uaddo_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_uaddo_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v4, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_co_u32 v1, s4, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX11-NEXT: v_add_co_u32 v0, s4, v0, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX11-NEXT: s_endpgm %a = load <2 x i32>, ptr addrspace(1) %aptr, align 4 %b = load <2 x 
i32>, ptr addrspace(1) %bptr, align 4 %sadd = call { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind @@ -623,26 +887,27 @@ define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; SI-NEXT: s_cmp_eq_u32 s0, s1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_cbranch_scc1 .LBB8_2 ; SI-NEXT: ; %bb.1: ; %if ; SI-NEXT: s_xor_b64 s[0:1], vcc, -1 ; SI-NEXT: .LBB8_2: ; %exit ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_uaddo_clamp_bit: @@ -687,6 +952,45 @@ define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; GFX9-NEXT: global_store_dword v1, v0, s[8:9] ; GFX9-NEXT: global_store_byte v1, v2, s[10:11] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_uaddo_clamp_bit: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, s1, s2, s3 +; GFX10-NEXT: s_cmp_eq_u32 s2, s3 +; GFX10-NEXT: s_cbranch_scc1 .LBB8_2 +; GFX10-NEXT: ; %bb.1: ; %if +; GFX10-NEXT: s_xor_b32 s0, s1, -1 +; GFX10-NEXT: .LBB8_2: ; %exit +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dword v1, v0, s[8:9] +; GFX10-NEXT: global_store_byte v1, v2, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_uaddo_clamp_bit: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x34 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, s1, s2, s3 +; GFX11-NEXT: s_cmp_eq_u32 s2, s3 +; GFX11-NEXT: s_cbranch_scc1 .LBB8_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: s_xor_b32 s0, s1, -1 +; GFX11-NEXT: .LBB8_2: ; %exit +; GFX11-NEXT: s_load_b128 s[4:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: global_store_b8 v1, v2, s[6:7] +; GFX11-NEXT: s_endpgm entry: %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) %val = extractvalue { i32, i1 } %uadd, 0 @@ -711,19 +1015,19 @@ define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; SI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: 
s_mov_b32 s15, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s8 ; SI-NEXT: s_mov_b32 s1, s9 -; SI-NEXT: s_mov_b32 s12, s10 -; SI-NEXT: s_mov_b32 s13, s11 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 +; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e64 v0, s[0:1], v1, v2 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: v_add_i32_e64 v0, s[0:1], v1, v2 ; SI-NEXT: s_cbranch_vccnz .LBB9_2 ; SI-NEXT: ; %bb.1: ; %if ; SI-NEXT: s_xor_b64 s[8:9], s[0:1], -1 @@ -786,6 +1090,50 @@ define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] ; GFX9-NEXT: global_store_byte v0, v1, s[10:11] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_uaddo_clamp_bit: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[12:13] +; GFX10-NEXT: global_load_dword v2, v0, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_add_co_u32 v1, s1, v1, v2 +; GFX10-NEXT: s_cbranch_vccnz .LBB9_2 +; GFX10-NEXT: ; %bb.1: ; %if +; GFX10-NEXT: s_xor_b32 s0, s1, -1 +; GFX10-NEXT: .LBB9_2: ; %exit +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[8:9] +; GFX10-NEXT: global_store_byte v0, v2, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_uaddo_clamp_bit: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-NEXT: global_load_b32 v2, v0, s[6:7] +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: v_add_co_u32 v1, s5, v1, v2 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: s_xor_b32 s4, s5, -1 +; GFX11-NEXT: .LBB9_2: ; %exit +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX11-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -813,23 +1161,23 @@ exit: define amdgpu_cs void @sv_uaddo_i128(ptr addrspace(1) %out, i128 inreg %a, i128 %b) { ; SI-LABEL: sv_uaddo_i128: ; SI: ; %bb.0: -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; SI-NEXT: v_mov_b32_e32 v6, s1 -; SI-NEXT: v_mov_b32_e32 v7, s2 -; SI-NEXT: v_mov_b32_e32 v8, s3 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: v_addc_u32_e32 v3, vcc, v6, v3, vcc -; SI-NEXT: v_addc_u32_e32 v4, vcc, v7, v4, vcc -; SI-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], v[2:3] -; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; SI-NEXT: v_addc_u32_e32 v5, vcc, v8, v5, vcc +; SI-NEXT: v_mov_b32_e32 v6, s2 +; SI-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc +; SI-NEXT: v_mov_b32_e32 v6, s3 +; SI-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc +; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[2:3] +; 
SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[4:5] +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; SI-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[4:5] +; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; SI-NEXT: s_mov_b32 s5, s6 ; SI-NEXT: v_and_b32_e32 v2, 1, v2 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm @@ -871,6 +1219,41 @@ define amdgpu_cs void @sv_uaddo_i128(ptr addrspace(1) %out, i128 inreg %a, i128 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: sv_uaddo_i128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, s0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v4, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v5, vcc_lo +; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[4:5] +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5] +; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: sv_uaddo_i128: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, s0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v4, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[4:5] +; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[2:3] +; GFX11-NEXT: v_mov_b16_e32 v2.l, v6.l +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b16 v2.l, v2.l, v3.l, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_endpgm %uadd = call { i128, i1 } @llvm.uadd.with.overflow.i128(i128 %a, i128 %b) %carry = extractvalue { i128, i1 } %uadd, 1 %carry.ext = zext i1 %carry to i32 diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll index 7d7f1b4..0289dab 100644 --- a/llvm/test/CodeGen/AMDGPU/usubo.ll +++ b/llvm/test/CodeGen/AMDGPU/usubo.ll @@ -1,8 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s - +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti | FileCheck %s --check-prefix=SI +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=VI +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s --check-prefix=GFX9 +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s 
--check-prefix=GFX10 +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s --check-prefix=GFX11 define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 { ; SI-LABEL: s_usubo_i64_zext: @@ -13,14 +14,14 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_sub_u32 s0, s2, s8 ; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_subb_u32 s1, s3, s9 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -62,6 +63,40 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_usubo_i64_zext: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_sub_u32 s4, s2, s6 +; GFX10-NEXT: s_subb_u32 s5, s3, s7 +; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], s[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX10-NEXT: v_add_co_u32 v0, s2, s4, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s5, 0, s2 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_usubo_i64_zext: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_sub_u32 s4, s2, s4 +; GFX11-NEXT: s_subb_u32 s5, s3, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], s[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v0, s2, s4, v0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s2 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) #0 %val = extractvalue { i64, i1 } %usub, 0 %carry = extractvalue { i64, i1 } %usub, 1 @@ -76,21 +111,21 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-LABEL: s_usubo_i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xd +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: v_mov_b32_e32 v0, s9 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_mov_b32 s8, s2 -; SI-NEXT: s_mov_b32 s9, s3 -; SI-NEXT: v_mov_b32_e32 v0, s13 -; SI-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 +; SI-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 +; SI-NEXT: s_mov_b32 s0, s2 +; SI-NEXT: s_mov_b32 s1, s3 +; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s3, s7 ; 
SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0 +; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_usubo_i32: @@ -121,6 +156,34 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: global_store_byte v0, v2, s[2:3] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_usubo_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_sub_co_u32 v1, s4, s6, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_byte v0, v2, s[2:3] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_usubo_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_sub_co_u32 v1, s4, s6, s7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX11-NEXT: s_endpgm %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) %val = extractvalue { i32, i1 } %usub, 0 %carry = extractvalue { i32, i1 } %usub, 1 @@ -137,17 +200,15 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s4 ; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s16, s6 -; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -193,6 +254,38 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: global_store_byte v0, v2, s[10:11] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_usubo_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[12:13] +; GFX10-NEXT: global_load_dword v2, v0, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_sub_co_u32 v1, s0, v1, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[8:9] +; GFX10-NEXT: global_store_byte v0, v2, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_usubo_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-NEXT: global_load_b32 v2, v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_co_u32 v1, s4, v1, v2 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr @@ -215,17 +308,15 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s4 ; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s16, s6 -; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -283,6 +374,45 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac ; GFX9-NEXT: global_store_byte v0, v2, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_usubo_i32_novcc: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[12:13] +; GFX10-NEXT: global_load_dword v2, v0, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_sub_co_u32 v1, s0, v1, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[8:9] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ;;#ASMSTART +; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: global_store_byte v0, v2, s[10:11] +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_usubo_i32_novcc: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-NEXT: global_load_b32 v2, v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_co_u32 v1, s4, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr @@ -306,21 +436,21 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_sub_u32 s6, s4, s6 -; SI-NEXT: s_subb_u32 s7, s5, s7 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_subb_u32 s7, s5, s7 ; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] ; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_mov_b32 s8, s0 +; SI-NEXT: 
s_mov_b32 s9, s1 +; SI-NEXT: s_mov_b32 s0, s2 +; SI-NEXT: s_mov_b32 s1, s3 +; SI-NEXT: s_mov_b32 s2, s10 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0 -; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0 +; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0 +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_usubo_i64: @@ -359,6 +489,37 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9] ; GFX9-NEXT: global_store_byte v4, v0, s[10:11] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_usubo_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_sub_u32 s0, s12, s14 +; GFX10-NEXT: s_subb_u32 s1, s13, s15 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], s[12:13] +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX10-NEXT: global_store_byte v2, v3, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_usubo_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_sub_u32 s6, s4, s6 +; GFX11-NEXT: s_subb_u32 s7, s5, s7 +; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b8 v2, v3, s[2:3] +; GFX11-NEXT: s_endpgm %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) %val = extractvalue { i64, i1 } %usub, 0 %carry = extractvalue { i64, i1 } %usub, 1 @@ -375,17 +536,15 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s4 ; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s16, s6 -; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -393,8 +552,8 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_sub_i32_e32 v2, vcc, v0, v2 ; SI-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc -; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0 ; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] +; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -437,6 +596,42 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: global_store_byte v4, 
v0, s[10:11] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_usubo_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9] +; GFX10-NEXT: global_store_byte v4, v0, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_usubo_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v4, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo +; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX11-NEXT: global_store_b8 v4, v0, s[2:3] +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i64, ptr addrspace(1) %a.ptr @@ -459,17 +654,15 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s4 ; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s16, s6 -; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 -; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -477,8 +670,8 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v1, v0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 @@ -522,6 +715,42 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: global_store_short v0, v2, s[8:9] ; GFX9-NEXT: global_store_byte v0, v1, s[10:11] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_usubo_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_ushort v1, v0, s[12:13] +; GFX10-NEXT: global_load_ushort v2, v0, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_sub_nc_u32_e32 v2, v1, v2 +; GFX10-NEXT: 
v_cmp_gt_u32_sdwa s0, v2, v1 src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX10-NEXT: global_store_short v0, v2, s[8:9] +; GFX10-NEXT: global_store_byte v0, v1, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_usubo_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_d16_b16 v1, v0, s[4:5] +; GFX11-NEXT: global_load_u16 v2, v0, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_nc_u32_e32 v2, v1, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, v3, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b16 v0, v2, s[0:1] +; GFX11-NEXT: global_store_b8 v0, v1, s[2:3] +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr @@ -544,17 +773,15 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s14, s10 ; SI-NEXT: s_mov_b32 s15, s11 -; SI-NEXT: s_mov_b32 s18, s10 -; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s12, s4 ; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s16, s6 -; SI-NEXT: s_mov_b32 s17, s7 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 ; SI-NEXT: s_mov_b32 s6, s10 ; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: s_mov_b32 s4, s2 @@ -606,6 +833,42 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] ; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_usubo_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[12:13] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_sub_co_u32 v1, s0, v1, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX10-NEXT: v_sub_co_u32 v0, s0, v0, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_usubo_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v4, s[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_sub_co_u32 v1, s4, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX11-NEXT: v_sub_co_u32 v0, s4, v0, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b64 v4, v[0:1], 
s[0:1] +; GFX11-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX11-NEXT: s_endpgm %a = load <2 x i32>, ptr addrspace(1) %aptr, align 4 %b = load <2 x i32>, ptr addrspace(1) %bptr, align 4 %sadd = call { <2 x i32>, <2 x i1> } @llvm.usub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind @@ -623,26 +886,27 @@ define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; SI-NEXT: s_cmp_eq_u32 s0, s1 +; SI-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_cbranch_scc1 .LBB8_2 ; SI-NEXT: ; %bb.1: ; %if ; SI-NEXT: s_xor_b64 s[0:1], vcc, -1 ; SI-NEXT: .LBB8_2: ; %exit ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s7 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_usubo_clamp_bit: @@ -687,6 +951,45 @@ define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; GFX9-NEXT: global_store_dword v1, v0, s[8:9] ; GFX9-NEXT: global_store_byte v1, v2, s[10:11] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: s_usubo_clamp_bit: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_sub_co_u32 v0, s1, s2, s3 +; GFX10-NEXT: s_cmp_eq_u32 s2, s3 +; GFX10-NEXT: s_cbranch_scc1 .LBB8_2 +; GFX10-NEXT: ; %bb.1: ; %if +; GFX10-NEXT: s_xor_b32 s0, s1, -1 +; GFX10-NEXT: .LBB8_2: ; %exit +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_store_dword v1, v0, s[8:9] +; GFX10-NEXT: global_store_byte v1, v2, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: s_usubo_clamp_bit: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x34 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_sub_co_u32 v0, s1, s2, s3 +; GFX11-NEXT: s_cmp_eq_u32 s2, s3 +; GFX11-NEXT: s_cbranch_scc1 .LBB8_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: s_xor_b32 s0, s1, -1 +; GFX11-NEXT: .LBB8_2: ; %exit +; GFX11-NEXT: s_load_b128 s[4:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX11-NEXT: global_store_b8 v1, v2, s[6:7] +; GFX11-NEXT: s_endpgm entry: %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) %val = extractvalue { i32, i1 } %usub, 0 @@ -712,19 +1015,19 @@ define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; 
SI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s8 ; SI-NEXT: s_mov_b32 s1, s9 -; SI-NEXT: s_mov_b32 s12, s10 -; SI-NEXT: s_mov_b32 s13, s11 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 -; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 +; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_sub_i32_e64 v0, s[0:1], v1, v2 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: v_sub_i32_e64 v0, s[0:1], v1, v2 ; SI-NEXT: s_cbranch_vccnz .LBB9_2 ; SI-NEXT: ; %bb.1: ; %if ; SI-NEXT: s_xor_b64 s[8:9], s[0:1], -1 @@ -787,6 +1090,50 @@ define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] ; GFX9-NEXT: global_store_byte v0, v1, s[10:11] ; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_usubo_clamp_bit: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dword v1, v0, s[12:13] +; GFX10-NEXT: global_load_dword v2, v0, s[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_sub_co_u32 v1, s1, v1, v2 +; GFX10-NEXT: s_cbranch_vccnz .LBB9_2 +; GFX10-NEXT: ; %bb.1: ; %if +; GFX10-NEXT: s_xor_b32 s0, s1, -1 +; GFX10-NEXT: .LBB9_2: ; %exit +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[8:9] +; GFX10-NEXT: global_store_byte v0, v2, s[10:11] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_usubo_clamp_bit: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[4:5] +; GFX11-NEXT: global_load_b32 v2, v0, s[6:7] +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: v_sub_co_u32 v1, s5, v1, v2 +; GFX11-NEXT: s_cbranch_vccnz .LBB9_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: s_xor_b32 s4, s5, -1 +; GFX11-NEXT: .LBB9_2: ; %exit +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX11-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/ARM/fp16.ll b/llvm/test/CodeGen/ARM/fp16.ll index dc35fa3..9ff7010 100644 --- a/llvm/test/CodeGen/ARM/fp16.ll +++ b/llvm/test/CodeGen/ARM/fp16.ll @@ -86,8 +86,8 @@ define i16 @test_to_fp16(double %in) { ; CHECK-FP16-SAFE: bl __aeabi_d2h -; CHECK-FP16-UNSAFE: vcvt.f32.f64 s0, d0 -; CHECK-FP16-UNSAFE-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-FP16-UNSAFE: vmov r0, r1, d0 +; CHECK-FP16-UNSAFE-NEXT: bl __aeabi_d2h ; CHECK-ARMV8: vcvtb.f16.f64 [[TMP:s[0-9]+]], d0 ; CHECK-ARMV8: vmov r0, [[TMP]] diff --git a/llvm/test/CodeGen/ARM/preferred-function-alignment.ll b/llvm/test/CodeGen/ARM/preferred-function-alignment.ll index f3a227c..2fc6790 100644 --- 
a/llvm/test/CodeGen/ARM/preferred-function-alignment.ll +++ b/llvm/test/CodeGen/ARM/preferred-function-alignment.ll @@ -22,3 +22,11 @@ define void @test() { define void @test_optsize() optsize { ret void } + +; CHECK-LABEL: test_minsize +; ALIGN-CS-16: .p2align 1 +; ALIGN-CS-32: .p2align 2 + +define void @test_minsize() minsize { + ret void +} diff --git a/llvm/test/CodeGen/BPF/BTF/map-def-2.ll b/llvm/test/CodeGen/BPF/BTF/map-def-2.ll index 5f971ec..d4c836f 100644 --- a/llvm/test/CodeGen/BPF/BTF/map-def-2.ll +++ b/llvm/test/CodeGen/BPF/BTF/map-def-2.ll @@ -1,5 +1,6 @@ -; RUN: llc -mtriple=bpfel -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK %s -; RUN: llc -mtriple=bpfeb -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK %s +; RUN: llc -mtriple=bpfel -mcpu=v3 -filetype=obj -o %t1 %s +; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1 +; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s ; ; Source code: ; struct key_type { @@ -18,51 +19,17 @@ @hash_map = dso_local local_unnamed_addr global %struct.map_type zeroinitializer, section ".maps", align 8, !dbg !0 -; CHECK: .long 0 # BTF_KIND_PTR(id = 1) -; CHECK-NEXT: .long 33554432 # 0x2000000 -; CHECK-NEXT: .long 2 -; CHECK-NEXT: .long 1 # BTF_KIND_STRUCT(id = 2) -; CHECK-NEXT: .long 67108865 # 0x4000001 -; CHECK-NEXT: .long 4 -; CHECK-NEXT: .long 10 -; CHECK-NEXT: .long 3 -; CHECK-NEXT: .long 0 # 0x0 -; CHECK-NEXT: .long 13 # BTF_KIND_INT(id = 3) -; CHECK-NEXT: .long 16777216 # 0x1000000 -; CHECK-NEXT: .long 4 -; CHECK-NEXT: .long 16777248 # 0x1000020 -; CHECK-NEXT: .long 17 # BTF_KIND_TYPEDEF(id = 4) -; CHECK-NEXT: .long 134217728 # 0x8000000 -; CHECK-NEXT: .long 5 -; CHECK-NEXT: .long 28 # BTF_KIND_TYPEDEF(id = 5) -; CHECK-NEXT: .long 134217728 # 0x8000000 -; CHECK-NEXT: .long 6 -; CHECK-NEXT: .long 38 # BTF_KIND_STRUCT(id = 6) -; CHECK-NEXT: .long 67108865 # 0x4000001 -; CHECK-NEXT: .long 8 -; CHECK-NEXT: .long 47 -; CHECK-NEXT: .long 1 -; CHECK-NEXT: .long 0 # 0x0 -; CHECK-NEXT: .long 51 # BTF_KIND_VAR(id = 7) -; CHECK-NEXT: .long 234881024 # 0xe000000 -; CHECK-NEXT: .long 4 -; CHECK-NEXT: .long 1 -; CHECK-NEXT: .long 60 # BTF_KIND_DATASEC(id = 8) -; CHECK-NEXT: .long 251658241 # 0xf000001 -; CHECK-NEXT: .long 0 -; CHECK-NEXT: .long 7 -; CHECK-NEXT: .long hash_map -; CHECK-NEXT: .long 8 - -; CHECK: .ascii "key_type" # string offset=1 -; CHECK: .ascii "a1" # string offset=10 -; CHECK: .ascii "int" # string offset=13 -; CHECK: .ascii "__map_type" # string offset=17 -; CHECK: .ascii "_map_type" # string offset=28 -; CHECK: .ascii "map_type" # string offset=38 -; CHECK: .ascii "key" # string offset=47 -; CHECK: .ascii "hash_map" # string offset=51 -; CHECK: .ascii ".maps" # string offset=60 +; CHECK-BTF: [1] PTR '(anon)' type_id=2 +; CHECK-BTF-NEXT: [2] STRUCT 'key_type' size=4 vlen=1 +; CHECK-BTF-NEXT: 'a1' type_id=3 bits_offset=0 +; CHECK-BTF-NEXT: [3] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED +; CHECK-BTF-NEXT: [4] STRUCT 'map_type' size=8 vlen=1 +; CHECK-BTF-NEXT: 'key' type_id=1 bits_offset=0 +; CHECK-BTF-NEXT: [5] TYPEDEF '_map_type' type_id=4 +; CHECK-BTF-NEXT: [6] TYPEDEF '__map_type' type_id=5 +; CHECK-BTF-NEXT: [7] VAR 'hash_map' type_id=6, linkage=global +; CHECK-BTF-NEXT: [8] DATASEC '.maps' size=0 vlen=1 +; CHECK-BTF-NEXT: type_id=7 offset=0 size=8 !llvm.dbg.cu = !{!2} !llvm.module.flags = !{!16, !17, !18} diff --git a/llvm/test/CodeGen/BPF/BTF/map-def-3.ll b/llvm/test/CodeGen/BPF/BTF/map-def-3.ll index 6aa8af9..1d95f03 100644 --- a/llvm/test/CodeGen/BPF/BTF/map-def-3.ll +++ 
b/llvm/test/CodeGen/BPF/BTF/map-def-3.ll @@ -1,5 +1,6 @@ -; RUN: llc -mtriple=bpfel -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK %s -; RUN: llc -mtriple=bpfeb -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK %s +; RUN: llc -mtriple=bpfel -mcpu=v3 -filetype=obj -o %t1 %s +; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1 +; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s ; ; Source code: ; struct key_type { @@ -13,36 +14,13 @@ @hash_map = dso_local local_unnamed_addr constant %struct.key_type zeroinitializer, section ".maps", align 4, !dbg !0 -; CHECK: .long 1 # BTF_KIND_INT(id = 1) -; CHECK-NEXT: .long 16777216 # 0x1000000 -; CHECK-NEXT: .long 4 -; CHECK-NEXT: .long 16777248 # 0x1000020 -; CHECK-NEXT: .long 0 # BTF_KIND_CONST(id = 2) -; CHECK-NEXT: .long 167772160 # 0xa000000 -; CHECK-NEXT: .long 3 -; CHECK-NEXT: .long 5 # BTF_KIND_STRUCT(id = 3) -; CHECK-NEXT: .long 67108865 # 0x4000001 -; CHECK-NEXT: .long 4 -; CHECK-NEXT: .long 14 -; CHECK-NEXT: .long 1 -; CHECK-NEXT: .long 0 # 0x0 -; CHECK-NEXT: .long 17 # BTF_KIND_VAR(id = 4) -; CHECK-NEXT: .long 234881024 # 0xe000000 -; CHECK-NEXT: .long 2 -; CHECK-NEXT: .long 1 -; CHECK-NEXT: .long 26 # BTF_KIND_DATASEC(id = 5) -; CHECK-NEXT: .long 251658241 # 0xf000001 -; CHECK-NEXT: .long 0 -; CHECK-NEXT: .long 4 -; CHECK-NEXT: .long hash_map -; CHECK-NEXT: .long 4 - -; CHECK: .ascii "int" # string offset=1 -; CHECK: .ascii "key_type" # string offset=5 -; CHECK: .ascii "a1" # string offset=14 -; CHECK: .ascii "hash_map" # string offset=17 -; CHECK: .ascii ".maps" # string offset=26 - +; CHECK-BTF: [1] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED +; CHECK-BTF-NEXT: [2] STRUCT 'key_type' size=4 vlen=1 +; CHECK-BTF-NEXT: 'a1' type_id=1 bits_offset=0 +; CHECK-BTF-NEXT: [3] CONST '(anon)' type_id=2 +; CHECK-BTF-NEXT: [4] VAR 'hash_map' type_id=3, linkage=global +; CHECK-BTF-NEXT: [5] DATASEC '.maps' size=0 vlen=1 +; CHECK-BTF-NEXT: type_id=4 offset=0 size=4 !llvm.dbg.cu = !{!2} !llvm.module.flags = !{!11, !12, !13} diff --git a/llvm/test/CodeGen/BPF/BTF/map-def-nested-array.ll b/llvm/test/CodeGen/BPF/BTF/map-def-nested-array.ll new file mode 100644 index 0000000..fc95daf --- /dev/null +++ b/llvm/test/CodeGen/BPF/BTF/map-def-nested-array.ll @@ -0,0 +1,75 @@ +; RUN: llc -mtriple=bpfel -mcpu=v3 -filetype=obj -o %t1 %s +; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1 +; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF-SHORT %s +; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s +; Source: +; struct nested_value_type { +; int a1; +; }; +; struct map_type { +; struct { +; struct nested_value_type *value; +; } *values[]; +; }; +; Compilation flags: +; clang -target bpf -g -O2 -S -emit-llvm prog.c + +; ModuleID = 'prog.c' +source_filename = "prog.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "bpf" + +%struct.map_type = type { [0 x ptr] } + +@array_of_maps = dso_local local_unnamed_addr global %struct.map_type zeroinitializer, section ".maps", align 8, !dbg !0 + +; We expect no forward declarations. +; +; CHECK-BTF-SHORT-NOT: FWD + +; Assert the whole BTF. 
+; +; CHECK-BTF: [1] PTR '(anon)' type_id=2 +; CHECK-BTF-NEXT: [2] STRUCT 'nested_value_type' size=4 vlen=1 +; CHECK-BTF-NEXT: 'a1' type_id=3 bits_offset=0 +; CHECK-BTF-NEXT: [3] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED +; CHECK-BTF-NEXT: [4] STRUCT '(anon)' size=8 vlen=1 +; CHECK-BTF-NEXT: 'value' type_id=1 bits_offset=0 +; CHECK-BTF-NEXT: [5] PTR '(anon)' type_id=4 +; CHECK-BTF-NEXT: [6] ARRAY '(anon)' type_id=5 index_type_id=7 nr_elems=0 +; CHECK-BTF-NEXT: [7] INT '__ARRAY_SIZE_TYPE__' size=4 bits_offset=0 nr_bits=32 encoding=(none) +; CHECK-BTF-NEXT: [8] STRUCT 'map_type' size=0 vlen=1 +; CHECK-BTF-NEXT: 'values' type_id=6 bits_offset=0 +; CHECK-BTF-NEXT: [9] VAR 'array_of_maps' type_id=8, linkage=global +; CHECK-BTF-NEXT: [10] DATASEC '.maps' size=0 vlen=1 +; CHECK-BTF-NEXT: type_id=9 offset=0 size=0 + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!20, !21, !22, !23} +!llvm.ident = !{!24} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "array_of_maps", scope: !2, file: !3, line: 9, type: !5, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C11, file: !3, producer: "clang version 22.0.0git (git@github.com:llvm/llvm-project.git ed93eaa421b714028b85cc887d80c45991d7207f)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: None) +!3 = !DIFile(filename: "prog.c", directory: "/home/mtardy/llvm-bug-repro", checksumkind: CSK_MD5, checksum: "9381d9e83e9c0b235a14704224815e96") +!4 = !{!0} +!5 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "map_type", file: !3, line: 4, elements: !6) +!6 = !{!7} +!7 = !DIDerivedType(tag: DW_TAG_member, name: "values", scope: !5, file: !3, line: 7, baseType: !8) +!8 = !DICompositeType(tag: DW_TAG_array_type, baseType: !9, elements: !18) +!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64) +!10 = distinct !DICompositeType(tag: DW_TAG_structure_type, scope: !5, file: !3, line: 5, size: 64, elements: !11) +!11 = !{!12} +!12 = !DIDerivedType(tag: DW_TAG_member, name: "value", scope: !10, file: !3, line: 6, baseType: !13, size: 64) +!13 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !14, size: 64) +!14 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "nested_value_type", file: !3, line: 1, size: 32, elements: !15) +!15 = !{!16} +!16 = !DIDerivedType(tag: DW_TAG_member, name: "a1", scope: !14, file: !3, line: 2, baseType: !17, size: 32) +!17 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!18 = !{!19} +!19 = !DISubrange(count: -1) +!20 = !{i32 7, !"Dwarf Version", i32 5} +!21 = !{i32 2, !"Debug Info Version", i32 3} +!22 = !{i32 1, !"wchar_size", i32 4} +!23 = !{i32 7, !"frame-pointer", i32 2} +!24 = !{!"clang version 22.0.0git (git@github.com:llvm/llvm-project.git ed93eaa421b714028b85cc887d80c45991d7207f)"} diff --git a/llvm/test/CodeGen/Hexagon/hvx-reuse-fi-base.ll b/llvm/test/CodeGen/Hexagon/hvx-reuse-fi-base.ll index 16cc1f3..e5a6aa4 100644 --- a/llvm/test/CodeGen/Hexagon/hvx-reuse-fi-base.ll +++ b/llvm/test/CodeGen/Hexagon/hvx-reuse-fi-base.ll @@ -183,7 +183,7 @@ b0: %v11 = call <64 x i32> @llvm.hexagon.V6.vaddubh.128B(<32 x i32> %v10, <32 x i32> undef) %v12 = call <64 x i32> @llvm.hexagon.V6.vrmpyubi.128B(<64 x i32> %v11, i32 2147483647, i32 1) store <64 x i32> %v12, ptr @g0, align 128 - call void (ptr, ...) @f1(ptr @g3) #2 + call void (ptr, ...) 
@f1(ptr @g3) #3 %v13 = call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 2) %v14 = call <64 x i32> @llvm.hexagon.V6.vaddubh.128B(<32 x i32> undef, <32 x i32> %v13) %v15 = call <64 x i32> @llvm.hexagon.V6.vrmpyubi.128B(<64 x i32> %v14, i32 -2147483648, i32 1) @@ -193,7 +193,7 @@ b0: %v17 = call <64 x i32> @llvm.hexagon.V6.vaddubh.128B(<32 x i32> undef, <32 x i32> %v16) %v18 = call <64 x i32> @llvm.hexagon.V6.vrmpyubi.128B(<64 x i32> %v17, i32 0, i32 1) store <64 x i32> %v18, ptr @g0, align 128 - call void @f0() #2 + call void @f0() #3 %v19 = call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1) %v20 = call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 2) %v21 = call <64 x i32> @llvm.hexagon.V6.vaddubh.128B(<32 x i32> %v19, <32 x i32> %v20) @@ -205,3 +205,4 @@ b0: attributes #0 = { nounwind "use-soft-float"="false" "target-cpu"="hexagonv66" "target-features"="+hvxv66,+hvx-length128b" } attributes #1 = { nounwind readnone } attributes #2 = { nounwind optsize } +attributes #3 = { nounwind minsize } diff --git a/llvm/test/CodeGen/MIR/AMDGPU/noalias-addrspace-expect-id.mir b/llvm/test/CodeGen/MIR/AMDGPU/noalias-addrspace-expect-id.mir new file mode 100644 index 0000000..4179ff2 --- /dev/null +++ b/llvm/test/CodeGen/MIR/AMDGPU/noalias-addrspace-expect-id.mir @@ -0,0 +1,29 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: not llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s + +--- | + define void @expect_id(ptr %ptr, float %data) #0 { + %1 = atomicrmw fadd ptr %ptr, float %data syncscope("agent") seq_cst, align 4, !noalias.addrspace !0 + ret void + } + + attributes #0 = { "target-cpu"="gfx1200" } + + !0 = !{i32 5, i32 6} +... + +--- +name: expect_id + +body: | + bb.1 (%ir-block.0): + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; CHECK: expected metadata id after '!' + %2:vgpr_32 = COPY $vgpr0 + %3:vgpr_32 = COPY $vgpr1 + %0:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1 + %1:vgpr_32 = COPY $vgpr2 + FLAT_ATOMIC_ADD_F32 %0, %1, 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, !noalias.addrspace !!) + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/MIR/AMDGPU/noalias-addrspace-parse.mir b/llvm/test/CodeGen/MIR/AMDGPU/noalias-addrspace-parse.mir new file mode 100644 index 0000000..7fe6aa9 --- /dev/null +++ b/llvm/test/CodeGen/MIR/AMDGPU/noalias-addrspace-parse.mir @@ -0,0 +1,36 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=none -o - %s | FileCheck %s + + +--- | + define void @test_parsing_printing(ptr %ptr, float %data) { + %1 = atomicrmw fadd ptr %ptr, float %data syncscope("agent") seq_cst, align 4, !noalias.addrspace !0 + ret void + } + + !0 = !{i32 5, i32 6} +... 
+
+---
+name: test_parsing_printing
+
+body: |
+  bb.1 (%ir-block.0):
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: test_parsing_printing
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; CHECK-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, !noalias.addrspace !0)
+    ; CHECK-NEXT: S_ENDPGM 0
+    %2:vgpr_32 = COPY $vgpr0
+    %3:vgpr_32 = COPY $vgpr1
+    %0:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
+    %1:vgpr_32 = COPY $vgpr2
+    FLAT_ATOMIC_ADD_F32 %0, %1, 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, !noalias.addrspace !0)
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/noalias-addrspace-undefine-matadata.mir b/llvm/test/CodeGen/MIR/AMDGPU/noalias-addrspace-undefine-matadata.mir
new file mode 100644
index 0000000..505b514
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/AMDGPU/noalias-addrspace-undefine-matadata.mir
@@ -0,0 +1,28 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: not llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
+
+
+--- |
+  define void @undefined_metadata(ptr %ptr, float %data) {
+    %1 = atomicrmw fadd ptr %ptr, float %data syncscope("agent") seq_cst, align 4, !noalias.addrspace !0
+    ret void
+  }
+
+  !0 = !{i32 5, i32 6}
+...
+
+---
+name: undefined_metadata
+
+body: |
+  bb.1 (%ir-block.0):
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK: use of undefined metadata '!3'
+    %2:vgpr_32 = COPY $vgpr0
+    %3:vgpr_32 = COPY $vgpr1
+    %0:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %3, %subreg.sub1
+    %1:vgpr_32 = COPY $vgpr2
+    FLAT_ATOMIC_ADD_F32 %0, %1, 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst (s32) on %ir.ptr, !noalias.addrspace !3)
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/MIR/X86/call-site-info-ambiguous-indirect-call-typeid.mir b/llvm/test/CodeGen/MIR/X86/call-site-info-ambiguous-indirect-call-typeid.mir
new file mode 100644
index 0000000..cb78898
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/X86/call-site-info-ambiguous-indirect-call-typeid.mir
@@ -0,0 +1,31 @@
+# Test MIR printer and parser to check if a call instruction with multiple
+# callee types is handled correctly.
+
+# RUN: llc -mtriple=x86_64 --call-graph-section %s -run-pass=none -o - | FileCheck --match-full-lines %s
+# CHECK: name: ambiguous_caller
+# CHECK: callSites:
+# CHECK-NEXT: - { bb: {{.*}}, offset: {{.*}}, fwdArgRegs: {{.*}}, calleeTypeIds:
+# CHECK-NEXT: [ 1234, 5678 ] }
+
+--- |
+  define ptr @ambiguous_caller() {
+  entry:
+    %fn = alloca ptr, align 8
+    %call1 = call ptr %fn(i64 4), !callee_type !0
+    ret ptr %call1
+  }
+
+  !0 = !{!1, !2}
+  !1 = !{i64 0, !"callee_type0.generalized"}
+  !2 = !{i64 0, !"callee_type2.generalized"}
+...
+---
+name: ambiguous_caller
+callSites:
+  - { bb: 0, offset: 1, fwdArgRegs: [], calleeTypeIds: [ 1234, 5678 ] }
+body: |
+  bb.0.entry:
+    %0:gr64 = MOV32ri64 4
+    CALL64r killed %0, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+    RET 0, $rax
+...
diff --git a/llvm/test/CodeGen/MIR/X86/call-site-info-direct-calls-typeid.mir b/llvm/test/CodeGen/MIR/X86/call-site-info-direct-calls-typeid.mir
new file mode 100644
index 0000000..faa021c
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/X86/call-site-info-direct-calls-typeid.mir
@@ -0,0 +1,54 @@
+# Test that the MIR printer and parser do NOT emit the `CalleeTypeIds` field in callSites.
+# `CalleeTypeId` is used for propagating call site type identifiers for
+# indirect targets only. This test does not contain any indirect targets.
+
+# RUN: llc -mtriple=x86_64 --call-graph-section %s -run-pass=none -o - | FileCheck --match-full-lines %s
+# CHECK-NOT: calleeTypeIds
+# CHECK: name: bar
+# CHECK: callSites:
+# CHECK-NEXT: - { bb: {{.*}}, offset: {{.*}}, fwdArgRegs: [] }
+# CHECK-NEXT: - { bb: {{.*}}, offset: {{.*}}, fwdArgRegs: [] }
+# CHECK: name: foo
+# CHECK: callSites:
+# CHECK-NEXT: - { bb: {{.*}}, offset: {{.*}}, fwdArgRegs: [] }
+
+--- |
+  declare i32 @fizz(i32, i32)
+
+  declare i32 @buzz(i32, i32)
+
+  define i32 @bar(i32 %x, i32 %y) !type !0 {
+  entry:
+    %call = call i32 @buzz(i32 %x, i32 %x)
+    %call1 = call i32 @fizz(i32 %x, i32 %x)
+    ret i32 0
+  }
+
+  define i32 @foo(i32 %x, i32 %y) !type !0 {
+  entry:
+    %call1 = call i32 @bar(i32 %x, i32 %x)
+    ret i32 0
+  }
+
+  !0 = !{i64 0, !"_ZTSFiiiE.generalized"}
+...
+---
+name: bar
+callSites:
+  - { bb: 0, offset: 0, fwdArgRegs: [] }
+  - { bb: 0, offset: 1, fwdArgRegs: [] }
+body: |
+  bb.0.entry:
+    CALL64pcrel32 target-flags(x86-plt) @buzz, csr_64, implicit $rsp, implicit $ssp, implicit $edi, implicit $esi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax
+    CALL64pcrel32 target-flags(x86-plt) @fizz, csr_64, implicit $rsp, implicit $ssp, implicit $edi, implicit $esi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax
+
+...
+---
+name: foo
+callSites:
+  - { bb: 0, offset: 0, fwdArgRegs: [] }
+body: |
+  bb.0.entry:
+    CALL64pcrel32 target-flags(x86-plt) @bar, csr_64, implicit $rsp, implicit $ssp, implicit $edi, implicit $esi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax
+
+...
diff --git a/llvm/test/CodeGen/MIR/X86/call-site-info-typeid.mir b/llvm/test/CodeGen/MIR/X86/call-site-info-typeid.mir
new file mode 100644
index 0000000..303b8fa
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/X86/call-site-info-typeid.mir
@@ -0,0 +1,28 @@
+# Test MIR printer and parser for the type id field in callSites. It is used
+# for propagating call site type identifiers to be emitted in the call graph section.
+
+# RUN: llc -mtriple=x86_64 --call-graph-section %s -run-pass=none -o - | FileCheck --match-full-lines %s
+# CHECK: name: call_foo
+# CHECK: callSites:
+# CHECK-NEXT: - { bb: {{.*}}, offset: {{.*}}, fwdArgRegs: [], calleeTypeIds:
+# CHECK-NEXT: [ 123456789 ] }
+
+--- |
+  define i32 @call_foo() {
+  entry:
+    %0 = load ptr, ptr null, align 8
+    call void %0(i8 0), !callee_type !0
+    ret i32 0
+  }
+
+  !0 = !{!1}
+  !1 = !{i64 0, !"_ZTSFvcE.generalized"}
+...
+---
+name: call_foo
+callSites:
+  - { bb: 0, offset: 0, fwdArgRegs: [], calleeTypeIds: [ 123456789 ] }
+body: |
+  bb.0.entry:
+    CALL64m $noreg, 1, $noreg, 0, $noreg, csr_64, implicit $rsp, implicit $ssp, implicit $edi, implicit-def $rsp, implicit-def $ssp :: (load (s64) from `ptr null`)
+...
diff --git a/llvm/test/CodeGen/NVPTX/aggregate-return.ll b/llvm/test/CodeGen/NVPTX/aggregate-return.ll
index 7f52e52..abc873e 100644
--- a/llvm/test/CodeGen/NVPTX/aggregate-return.ll
+++ b/llvm/test/CodeGen/NVPTX/aggregate-return.ll
@@ -16,8 +16,8 @@ define void @test_v2f32(<2 x float> %input, ptr %output) {
 ; CHECK-NEXT: ld.param.b64 %rd1, [test_v2f32_param_0];
 ; CHECK-NEXT: { // callseq 0, 0
 ; CHECK-NEXT: .param .align 8 .b8 param0[8];
-; CHECK-NEXT: st.param.b64 [param0], %rd1;
 ; CHECK-NEXT: .param .align 8 .b8 retval0[8];
+; CHECK-NEXT: st.param.b64 [param0], %rd1;
 ; CHECK-NEXT: call.uni (retval0), barv, (param0);
 ; CHECK-NEXT: ld.param.b64 %rd2, [retval0];
 ; CHECK-NEXT: } // callseq 0
@@ -32,24 +32,24 @@ define void @test_v2f32(<2 x float> %input, ptr %output) {
 define void @test_v3f32(<3 x float> %input, ptr %output) {
 ; CHECK-LABEL: test_v3f32(
 ; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<10>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_v3f32_param_0];
-; CHECK-NEXT: ld.param.b32 %r3, [test_v3f32_param_0+8];
+; CHECK-NEXT: ld.param.b64 %rd1, [test_v3f32_param_0];
+; CHECK-NEXT: ld.param.b32 %r1, [test_v3f32_param_0+8];
 ; CHECK-NEXT: { // callseq 1, 0
 ; CHECK-NEXT: .param .align 16 .b8 param0[16];
-; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2};
-; CHECK-NEXT: st.param.b32 [param0+8], %r3;
 ; CHECK-NEXT: .param .align 16 .b8 retval0[16];
+; CHECK-NEXT: st.param.b32 [param0+8], %r1;
+; CHECK-NEXT: st.param.b64 [param0], %rd1;
 ; CHECK-NEXT: call.uni (retval0), barv3, (param0);
-; CHECK-NEXT: ld.param.v2.b32 {%r4, %r5}, [retval0];
-; CHECK-NEXT: ld.param.b32 %r6, [retval0+8];
+; CHECK-NEXT: ld.param.b32 %r2, [retval0+8];
+; CHECK-NEXT: ld.param.b64 %rd2, [retval0];
 ; CHECK-NEXT: } // callseq 1
-; CHECK-NEXT: ld.param.b64 %rd1, [test_v3f32_param_1];
-; CHECK-NEXT: st.v2.b32 [%rd1], {%r4, %r5};
-; CHECK-NEXT: st.b32 [%rd1+8], %r6;
+; CHECK-NEXT: ld.param.b64 %rd4, [test_v3f32_param_1];
+; CHECK-NEXT: st.b32 [%rd4+8], %r2;
+; CHECK-NEXT: st.b64 [%rd4], %rd2;
 ; CHECK-NEXT: ret;
 %call = tail call <3 x float> @barv3(<3 x float> %input)
 ; Make sure we don't load more values than we need to.
@@ -68,16 +68,16 @@ define void @test_a2f32([2 x float] %input, ptr %output) { ; CHECK-NEXT: ld.param.b32 %r2, [test_a2f32_param_0+4]; ; CHECK-NEXT: { // callseq 2, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[8]; -; CHECK-NEXT: st.param.b32 [param0], %r1; -; CHECK-NEXT: st.param.b32 [param0+4], %r2; ; CHECK-NEXT: .param .align 4 .b8 retval0[8]; +; CHECK-NEXT: st.param.b32 [param0+4], %r2; +; CHECK-NEXT: st.param.b32 [param0], %r1; ; CHECK-NEXT: call.uni (retval0), bara, (param0); -; CHECK-NEXT: ld.param.b32 %r3, [retval0]; -; CHECK-NEXT: ld.param.b32 %r4, [retval0+4]; +; CHECK-NEXT: ld.param.b32 %r3, [retval0+4]; +; CHECK-NEXT: ld.param.b32 %r4, [retval0]; ; CHECK-NEXT: } // callseq 2 ; CHECK-NEXT: ld.param.b64 %rd1, [test_a2f32_param_1]; -; CHECK-NEXT: st.b32 [%rd1+4], %r4; -; CHECK-NEXT: st.b32 [%rd1], %r3; +; CHECK-NEXT: st.b32 [%rd1+4], %r3; +; CHECK-NEXT: st.b32 [%rd1], %r4; ; CHECK-NEXT: ret; %call = tail call [2 x float] @bara([2 x float] %input) store [2 x float] %call, ptr %output, align 4 @@ -95,16 +95,16 @@ define void @test_s2f32({float, float} %input, ptr %output) { ; CHECK-NEXT: ld.param.b32 %r2, [test_s2f32_param_0+4]; ; CHECK-NEXT: { // callseq 3, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[8]; -; CHECK-NEXT: st.param.b32 [param0], %r1; -; CHECK-NEXT: st.param.b32 [param0+4], %r2; ; CHECK-NEXT: .param .align 4 .b8 retval0[8]; +; CHECK-NEXT: st.param.b32 [param0+4], %r2; +; CHECK-NEXT: st.param.b32 [param0], %r1; ; CHECK-NEXT: call.uni (retval0), bars, (param0); -; CHECK-NEXT: ld.param.b32 %r3, [retval0]; -; CHECK-NEXT: ld.param.b32 %r4, [retval0+4]; +; CHECK-NEXT: ld.param.b32 %r3, [retval0+4]; +; CHECK-NEXT: ld.param.b32 %r4, [retval0]; ; CHECK-NEXT: } // callseq 3 ; CHECK-NEXT: ld.param.b64 %rd1, [test_s2f32_param_1]; -; CHECK-NEXT: st.b32 [%rd1+4], %r4; -; CHECK-NEXT: st.b32 [%rd1], %r3; +; CHECK-NEXT: st.b32 [%rd1+4], %r3; +; CHECK-NEXT: st.b32 [%rd1], %r4; ; CHECK-NEXT: ret; %call = tail call {float, float} @bars({float, float} %input) store {float, float} %call, ptr %output, align 4 diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll index ba5813c..b4641d0 100644 --- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll @@ -208,13 +208,13 @@ define <2 x bfloat> @test_call(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [test_call_param_0]; -; CHECK-NEXT: ld.param.b32 %r2, [test_call_param_1]; ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.b32 [param0], %r1; ; CHECK-NEXT: .param .align 4 .b8 param1[4]; -; CHECK-NEXT: st.param.b32 [param1], %r2; ; CHECK-NEXT: .param .align 4 .b8 retval0[4]; +; CHECK-NEXT: ld.param.b32 %r2, [test_call_param_1]; +; CHECK-NEXT: st.param.b32 [param1], %r2; +; CHECK-NEXT: st.param.b32 [param0], %r1; ; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1); ; CHECK-NEXT: ld.param.b32 %r3, [retval0]; ; CHECK-NEXT: } // callseq 0 diff --git a/llvm/test/CodeGen/NVPTX/byval-const-global.ll b/llvm/test/CodeGen/NVPTX/byval-const-global.ll index ad9e4b0..b4934e1a 100644 --- a/llvm/test/CodeGen/NVPTX/byval-const-global.ll +++ b/llvm/test/CodeGen/NVPTX/byval-const-global.ll @@ -13,12 +13,12 @@ define void @foo() { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.global.b64 %rd1, [G]; -; CHECK-NEXT: ld.global.b64 %rd2, [G+8]; ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .align 8 .b8 
param0[16]; -; CHECK-NEXT: st.param.b64 [param0], %rd1; -; CHECK-NEXT: st.param.b64 [param0+8], %rd2; +; CHECK-NEXT: ld.global.b64 %rd1, [G+8]; +; CHECK-NEXT: st.param.b64 [param0+8], %rd1; +; CHECK-NEXT: ld.global.b64 %rd2, [G]; +; CHECK-NEXT: st.param.b64 [param0], %rd2; ; CHECK-NEXT: call.uni bar, (param0); ; CHECK-NEXT: } // callseq 0 ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll index 0cd7058..0eb7f64 100644 --- a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll +++ b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll @@ -44,11 +44,11 @@ entry: %arrayidx7 = getelementptr inbounds [16 x i8], ptr %buf, i64 0, i64 3 store float %3, ptr %arrayidx7, align 4 -; CHECK: .param .b64 param0; -; CHECK-NEXT: st.param.b64 [param0], %rd[[A_REG]] -; CHECK-NEXT: .param .b64 param1; -; CHECK-NEXT: st.param.b64 [param1], %rd[[SP_REG]] -; CHECK-NEXT: call.uni callee, +; CHECK-DAG: .param .b64 param0; +; CHECK-DAG: .param .b64 param1; +; CHECK-DAG: st.param.b64 [param0], %rd[[A_REG]] +; CHECK-DAG: st.param.b64 [param1], %rd[[SP_REG]] +; CHECK: call.uni callee, call void @callee(ptr %a, ptr %buf) #2 ret void diff --git a/llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll b/llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll index f67145d..483d48a 100644 --- a/llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll +++ b/llvm/test/CodeGen/NVPTX/call_bitcast_byval.ll @@ -14,11 +14,11 @@ target triple = "nvptx64-nvidia-cuda" %complex_half = type { half, half } ; CHECK: .param .align 2 .b8 param2[4]; -; CHECK: st.param.b16 [param2], %rs1; -; CHECK: st.param.b16 [param2+2], %rs2; ; CHECK: .param .align 2 .b8 retval0[4]; -; CHECK-NEXT: prototype_0 : .callprototype (.param .align 2 .b8 _[4]) _ (.param .b32 _, .param .b32 _, .param .align 2 .b8 _[4]); -; CHECK-NEXT: call (retval0), +; CHECK-DAG: st.param.b16 [param2], %rs{{[0-9]+}}; +; CHECK-DAG: st.param.b16 [param2+2], %rs{{[0-9]+}}; +; CHECK: prototype_0 : .callprototype (.param .align 2 .b8 _[4]) _ (.param .b32 _, .param .b32 _, .param .align 2 .b8 _[4]); +; CHECK: call (retval0), define weak_odr void @foo() { entry: %call.i.i.i = tail call %"class.complex" @_Z20__spirv_GroupCMulKHRjjN5__spv12complex_halfE(i32 0, i32 0, ptr byval(%"class.complex") null) @@ -36,10 +36,10 @@ define internal void @callee(ptr byval(%"class.complex") %byval_arg) { } define void @boom() { %fp = call ptr @usefp(ptr @callee) - ; CHECK: .param .align 2 .b8 param0[4]; - ; CHECK: st.param.b16 [param0], %rs1; - ; CHECK: st.param.b16 [param0+2], %rs2; - ; CHECK: .callprototype ()_ (.param .align 2 .b8 _[4]); + ; CHECK-DAG: .param .align 2 .b8 param0[4]; + ; CHECK-DAG: st.param.b16 [param0], %rs{{[0-9]+}}; + ; CHECK-DAG: st.param.b16 [param0+2], %rs{{[0-9]+}}; + ; CHECK-DAG: .callprototype ()_ (.param .align 2 .b8 _[4]); call void %fp(ptr byval(%"class.complex") null) ret void } diff --git a/llvm/test/CodeGen/NVPTX/combine-mad.ll b/llvm/test/CodeGen/NVPTX/combine-mad.ll index 2232810..da303b7 100644 --- a/llvm/test/CodeGen/NVPTX/combine-mad.ll +++ b/llvm/test/CodeGen/NVPTX/combine-mad.ll @@ -199,10 +199,10 @@ define i32 @test_mad_multi_use(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: add.s32 %r5, %r3, %r4; ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .b32 param0; -; CHECK-NEXT: st.param.b32 [param0], %r3; ; CHECK-NEXT: .param .b32 param1; -; CHECK-NEXT: st.param.b32 [param1], %r5; ; CHECK-NEXT: .param .b32 retval0; +; CHECK-NEXT: st.param.b32 [param0], %r3; +; CHECK-NEXT: st.param.b32 [param1], %r5; ; CHECK-NEXT: 
call.uni (retval0), use, (param0, param1); ; CHECK-NEXT: ld.param.b32 %r6, [retval0]; ; CHECK-NEXT: } // callseq 0 diff --git a/llvm/test/CodeGen/NVPTX/compare-int.ll b/llvm/test/CodeGen/NVPTX/compare-int.ll index b44ae47..9338172d 100644 --- a/llvm/test/CodeGen/NVPTX/compare-int.ll +++ b/llvm/test/CodeGen/NVPTX/compare-int.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 | FileCheck %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s ; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 | %ptxas-verify %} @@ -11,90 +12,180 @@ ;;; i64 define i64 @icmp_eq_i64(i64 %a, i64 %b) { -; CHECK: setp.eq.b64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} -; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_eq_i64( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [icmp_eq_i64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [icmp_eq_i64_param_1]; +; CHECK-NEXT: setp.eq.b64 %p1, %rd1, %rd2; +; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %cmp = icmp eq i64 %a, %b %ret = zext i1 %cmp to i64 ret i64 %ret } define i64 @icmp_ne_i64(i64 %a, i64 %b) { -; CHECK: setp.ne.b64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} -; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_ne_i64( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [icmp_ne_i64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [icmp_ne_i64_param_1]; +; CHECK-NEXT: setp.ne.b64 %p1, %rd1, %rd2; +; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %cmp = icmp ne i64 %a, %b %ret = zext i1 %cmp to i64 ret i64 %ret } define i64 @icmp_ugt_i64(i64 %a, i64 %b) { -; CHECK: setp.gt.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} -; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_ugt_i64( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [icmp_ugt_i64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [icmp_ugt_i64_param_1]; +; CHECK-NEXT: setp.gt.u64 %p1, %rd1, %rd2; +; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %cmp = icmp ugt i64 %a, %b %ret = zext i1 %cmp to i64 ret i64 %ret } define i64 @icmp_uge_i64(i64 %a, i64 %b) { -; CHECK: setp.ge.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} -; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_uge_i64( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [icmp_uge_i64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [icmp_uge_i64_param_1]; +; CHECK-NEXT: setp.ge.u64 %p1, %rd1, %rd2; +; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %cmp = icmp uge i64 %a, %b %ret = zext i1 %cmp to i64 ret i64 %ret } define i64 @icmp_ult_i64(i64 %a, i64 %b) { -; CHECK: setp.lt.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} -; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_ult_i64( +; CHECK: { +; 
CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [icmp_ult_i64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [icmp_ult_i64_param_1]; +; CHECK-NEXT: setp.lt.u64 %p1, %rd1, %rd2; +; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %cmp = icmp ult i64 %a, %b %ret = zext i1 %cmp to i64 ret i64 %ret } define i64 @icmp_ule_i64(i64 %a, i64 %b) { -; CHECK: setp.le.u64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} -; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_ule_i64( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [icmp_ule_i64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [icmp_ule_i64_param_1]; +; CHECK-NEXT: setp.le.u64 %p1, %rd1, %rd2; +; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %cmp = icmp ule i64 %a, %b %ret = zext i1 %cmp to i64 ret i64 %ret } define i64 @icmp_sgt_i64(i64 %a, i64 %b) { -; CHECK: setp.gt.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} -; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_sgt_i64( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [icmp_sgt_i64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [icmp_sgt_i64_param_1]; +; CHECK-NEXT: setp.gt.s64 %p1, %rd1, %rd2; +; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %cmp = icmp sgt i64 %a, %b %ret = zext i1 %cmp to i64 ret i64 %ret } define i64 @icmp_sge_i64(i64 %a, i64 %b) { -; CHECK: setp.ge.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} -; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_sge_i64( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [icmp_sge_i64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [icmp_sge_i64_param_1]; +; CHECK-NEXT: setp.ge.s64 %p1, %rd1, %rd2; +; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %cmp = icmp sge i64 %a, %b %ret = zext i1 %cmp to i64 ret i64 %ret } define i64 @icmp_slt_i64(i64 %a, i64 %b) { -; CHECK: setp.lt.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} -; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_slt_i64( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [icmp_slt_i64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [icmp_slt_i64_param_1]; +; CHECK-NEXT: setp.lt.s64 %p1, %rd1, %rd2; +; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %cmp = icmp slt i64 %a, %b %ret = zext i1 %cmp to i64 ret i64 %ret } define i64 @icmp_sle_i64(i64 %a, i64 %b) { -; CHECK: setp.le.s64 %p[[P0:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} -; CHECK: selp.b64 %rd{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_sle_i64( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [icmp_sle_i64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [icmp_sle_i64_param_1]; +; CHECK-NEXT: 
setp.le.s64 %p1, %rd1, %rd2; +; CHECK-NEXT: selp.b64 %rd3, 1, 0, %p1; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %cmp = icmp sle i64 %a, %b %ret = zext i1 %cmp to i64 ret i64 %ret @@ -103,90 +194,180 @@ define i64 @icmp_sle_i64(i64 %a, i64 %b) { ;;; i32 define i32 @icmp_eq_i32(i32 %a, i32 %b) { -; CHECK: setp.eq.b32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_eq_i32( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [icmp_eq_i32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [icmp_eq_i32_param_1]; +; CHECK-NEXT: setp.eq.b32 %p1, %r1, %r2; +; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; %cmp = icmp eq i32 %a, %b %ret = zext i1 %cmp to i32 ret i32 %ret } define i32 @icmp_ne_i32(i32 %a, i32 %b) { -; CHECK: setp.ne.b32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_ne_i32( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [icmp_ne_i32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [icmp_ne_i32_param_1]; +; CHECK-NEXT: setp.ne.b32 %p1, %r1, %r2; +; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; %cmp = icmp ne i32 %a, %b %ret = zext i1 %cmp to i32 ret i32 %ret } define i32 @icmp_ugt_i32(i32 %a, i32 %b) { -; CHECK: setp.gt.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_ugt_i32( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [icmp_ugt_i32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [icmp_ugt_i32_param_1]; +; CHECK-NEXT: setp.gt.u32 %p1, %r1, %r2; +; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; %cmp = icmp ugt i32 %a, %b %ret = zext i1 %cmp to i32 ret i32 %ret } define i32 @icmp_uge_i32(i32 %a, i32 %b) { -; CHECK: setp.ge.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_uge_i32( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [icmp_uge_i32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [icmp_uge_i32_param_1]; +; CHECK-NEXT: setp.ge.u32 %p1, %r1, %r2; +; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; %cmp = icmp uge i32 %a, %b %ret = zext i1 %cmp to i32 ret i32 %ret } define i32 @icmp_ult_i32(i32 %a, i32 %b) { -; CHECK: setp.lt.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_ult_i32( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [icmp_ult_i32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [icmp_ult_i32_param_1]; +; CHECK-NEXT: setp.lt.u32 %p1, %r1, %r2; +; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; %cmp = icmp ult i32 %a, %b %ret = zext i1 %cmp to i32 ret i32 %ret } define i32 
@icmp_ule_i32(i32 %a, i32 %b) { -; CHECK: setp.le.u32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_ule_i32( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [icmp_ule_i32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [icmp_ule_i32_param_1]; +; CHECK-NEXT: setp.le.u32 %p1, %r1, %r2; +; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; %cmp = icmp ule i32 %a, %b %ret = zext i1 %cmp to i32 ret i32 %ret } define i32 @icmp_sgt_i32(i32 %a, i32 %b) { -; CHECK: setp.gt.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_sgt_i32( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [icmp_sgt_i32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [icmp_sgt_i32_param_1]; +; CHECK-NEXT: setp.gt.s32 %p1, %r1, %r2; +; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; %cmp = icmp sgt i32 %a, %b %ret = zext i1 %cmp to i32 ret i32 %ret } define i32 @icmp_sge_i32(i32 %a, i32 %b) { -; CHECK: setp.ge.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_sge_i32( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [icmp_sge_i32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [icmp_sge_i32_param_1]; +; CHECK-NEXT: setp.ge.s32 %p1, %r1, %r2; +; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; %cmp = icmp sge i32 %a, %b %ret = zext i1 %cmp to i32 ret i32 %ret } define i32 @icmp_slt_i32(i32 %a, i32 %b) { -; CHECK: setp.lt.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_slt_i32( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [icmp_slt_i32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [icmp_slt_i32_param_1]; +; CHECK-NEXT: setp.lt.s32 %p1, %r1, %r2; +; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; %cmp = icmp slt i32 %a, %b %ret = zext i1 %cmp to i32 ret i32 %ret } define i32 @icmp_sle_i32(i32 %a, i32 %b) { -; CHECK: setp.le.s32 %p[[P0:[0-9]+]], %r{{[0-9]+}}, %r{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_sle_i32( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [icmp_sle_i32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [icmp_sle_i32_param_1]; +; CHECK-NEXT: setp.le.s32 %p1, %r1, %r2; +; CHECK-NEXT: selp.b32 %r3, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; %cmp = icmp sle i32 %a, %b %ret = zext i1 %cmp to i32 ret i32 %ret @@ -196,90 +377,190 @@ define i32 @icmp_sle_i32(i32 %a, i32 %b) { ;;; i16 define i16 @icmp_eq_i16(i16 %a, i16 %b) { -; CHECK: setp.eq.b16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_eq_i16( +; CHECK: { +; CHECK-NEXT: 
.reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [icmp_eq_i16_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [icmp_eq_i16_param_1]; +; CHECK-NEXT: setp.eq.b16 %p1, %rs1, %rs2; +; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %cmp = icmp eq i16 %a, %b %ret = zext i1 %cmp to i16 ret i16 %ret } define i16 @icmp_ne_i16(i16 %a, i16 %b) { -; CHECK: setp.ne.b16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_ne_i16( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [icmp_ne_i16_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [icmp_ne_i16_param_1]; +; CHECK-NEXT: setp.ne.b16 %p1, %rs1, %rs2; +; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %cmp = icmp ne i16 %a, %b %ret = zext i1 %cmp to i16 ret i16 %ret } define i16 @icmp_ugt_i16(i16 %a, i16 %b) { -; CHECK: setp.gt.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_ugt_i16( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [icmp_ugt_i16_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [icmp_ugt_i16_param_1]; +; CHECK-NEXT: setp.gt.u16 %p1, %rs1, %rs2; +; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %cmp = icmp ugt i16 %a, %b %ret = zext i1 %cmp to i16 ret i16 %ret } define i16 @icmp_uge_i16(i16 %a, i16 %b) { -; CHECK: setp.ge.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_uge_i16( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [icmp_uge_i16_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [icmp_uge_i16_param_1]; +; CHECK-NEXT: setp.ge.u16 %p1, %rs1, %rs2; +; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %cmp = icmp uge i16 %a, %b %ret = zext i1 %cmp to i16 ret i16 %ret } define i16 @icmp_ult_i16(i16 %a, i16 %b) { -; CHECK: setp.lt.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_ult_i16( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [icmp_ult_i16_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [icmp_ult_i16_param_1]; +; CHECK-NEXT: setp.lt.u16 %p1, %rs1, %rs2; +; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %cmp = icmp ult i16 %a, %b %ret = zext i1 %cmp to i16 ret i16 %ret } define i16 @icmp_ule_i16(i16 %a, i16 %b) { -; CHECK: setp.le.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_ule_i16( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; 
CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [icmp_ule_i16_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [icmp_ule_i16_param_1]; +; CHECK-NEXT: setp.le.u16 %p1, %rs1, %rs2; +; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %cmp = icmp ule i16 %a, %b %ret = zext i1 %cmp to i16 ret i16 %ret } define i16 @icmp_sgt_i16(i16 %a, i16 %b) { -; CHECK: setp.gt.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_sgt_i16( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [icmp_sgt_i16_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [icmp_sgt_i16_param_1]; +; CHECK-NEXT: setp.gt.s16 %p1, %rs1, %rs2; +; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %cmp = icmp sgt i16 %a, %b %ret = zext i1 %cmp to i16 ret i16 %ret } define i16 @icmp_sge_i16(i16 %a, i16 %b) { -; CHECK: setp.ge.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_sge_i16( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [icmp_sge_i16_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [icmp_sge_i16_param_1]; +; CHECK-NEXT: setp.ge.s16 %p1, %rs1, %rs2; +; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %cmp = icmp sge i16 %a, %b %ret = zext i1 %cmp to i16 ret i16 %ret } define i16 @icmp_slt_i16(i16 %a, i16 %b) { -; CHECK: setp.lt.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_slt_i16( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [icmp_slt_i16_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [icmp_slt_i16_param_1]; +; CHECK-NEXT: setp.lt.s16 %p1, %rs1, %rs2; +; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %cmp = icmp slt i16 %a, %b %ret = zext i1 %cmp to i16 ret i16 %ret } define i16 @icmp_sle_i16(i16 %a, i16 %b) { -; CHECK: setp.le.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_sle_i16( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [icmp_sle_i16_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [icmp_sle_i16_param_1]; +; CHECK-NEXT: setp.le.s16 %p1, %rs1, %rs2; +; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %cmp = icmp sle i16 %a, %b %ret = zext i1 %cmp to i16 ret i16 %ret @@ -290,9 +571,19 @@ define i16 @icmp_sle_i16(i16 %a, i16 %b) { define i8 @icmp_eq_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.eq.b16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_eq_i8( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; 
CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [icmp_eq_i8_param_0]; +; CHECK-NEXT: ld.param.b8 %rs2, [icmp_eq_i8_param_1]; +; CHECK-NEXT: setp.eq.b16 %p1, %rs1, %rs2; +; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %cmp = icmp eq i8 %a, %b %ret = zext i1 %cmp to i8 ret i8 %ret @@ -300,9 +591,19 @@ define i8 @icmp_eq_i8(i8 %a, i8 %b) { define i8 @icmp_ne_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.ne.b16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_ne_i8( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [icmp_ne_i8_param_0]; +; CHECK-NEXT: ld.param.b8 %rs2, [icmp_ne_i8_param_1]; +; CHECK-NEXT: setp.ne.b16 %p1, %rs1, %rs2; +; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %cmp = icmp ne i8 %a, %b %ret = zext i1 %cmp to i8 ret i8 %ret @@ -310,9 +611,19 @@ define i8 @icmp_ne_i8(i8 %a, i8 %b) { define i8 @icmp_ugt_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.gt.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_ugt_i8( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [icmp_ugt_i8_param_0]; +; CHECK-NEXT: ld.param.b8 %rs2, [icmp_ugt_i8_param_1]; +; CHECK-NEXT: setp.gt.u16 %p1, %rs1, %rs2; +; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %cmp = icmp ugt i8 %a, %b %ret = zext i1 %cmp to i8 ret i8 %ret @@ -320,9 +631,19 @@ define i8 @icmp_ugt_i8(i8 %a, i8 %b) { define i8 @icmp_uge_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.ge.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_uge_i8( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [icmp_uge_i8_param_0]; +; CHECK-NEXT: ld.param.b8 %rs2, [icmp_uge_i8_param_1]; +; CHECK-NEXT: setp.ge.u16 %p1, %rs1, %rs2; +; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %cmp = icmp uge i8 %a, %b %ret = zext i1 %cmp to i8 ret i8 %ret @@ -330,9 +651,19 @@ define i8 @icmp_uge_i8(i8 %a, i8 %b) { define i8 @icmp_ult_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.lt.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_ult_i8( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [icmp_ult_i8_param_0]; +; CHECK-NEXT: ld.param.b8 %rs2, [icmp_ult_i8_param_1]; +; CHECK-NEXT: setp.lt.u16 %p1, %rs1, %rs2; +; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %cmp = icmp ult i8 %a, %b %ret = zext i1 %cmp to i8 ret i8 %ret @@ -340,9 +671,19 @@ define i8 @icmp_ult_i8(i8 %a, i8 %b) { define i8 @icmp_ule_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; 
CHECK: setp.le.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_ule_i8( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [icmp_ule_i8_param_0]; +; CHECK-NEXT: ld.param.b8 %rs2, [icmp_ule_i8_param_1]; +; CHECK-NEXT: setp.le.u16 %p1, %rs1, %rs2; +; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %cmp = icmp ule i8 %a, %b %ret = zext i1 %cmp to i8 ret i8 %ret @@ -350,9 +691,19 @@ define i8 @icmp_ule_i8(i8 %a, i8 %b) { define i8 @icmp_sgt_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.gt.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_sgt_i8( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.s8 %rs1, [icmp_sgt_i8_param_0]; +; CHECK-NEXT: ld.param.s8 %rs2, [icmp_sgt_i8_param_1]; +; CHECK-NEXT: setp.gt.s16 %p1, %rs1, %rs2; +; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %cmp = icmp sgt i8 %a, %b %ret = zext i1 %cmp to i8 ret i8 %ret @@ -360,9 +711,19 @@ define i8 @icmp_sgt_i8(i8 %a, i8 %b) { define i8 @icmp_sge_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.ge.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_sge_i8( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.s8 %rs1, [icmp_sge_i8_param_0]; +; CHECK-NEXT: ld.param.s8 %rs2, [icmp_sge_i8_param_1]; +; CHECK-NEXT: setp.ge.s16 %p1, %rs1, %rs2; +; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %cmp = icmp sge i8 %a, %b %ret = zext i1 %cmp to i8 ret i8 %ret @@ -370,9 +731,19 @@ define i8 @icmp_sge_i8(i8 %a, i8 %b) { define i8 @icmp_slt_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.lt.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_slt_i8( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.s8 %rs1, [icmp_slt_i8_param_0]; +; CHECK-NEXT: ld.param.s8 %rs2, [icmp_slt_i8_param_1]; +; CHECK-NEXT: setp.lt.s16 %p1, %rs1, %rs2; +; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %cmp = icmp slt i8 %a, %b %ret = zext i1 %cmp to i8 ret i8 %ret @@ -380,9 +751,19 @@ define i8 @icmp_slt_i8(i8 %a, i8 %b) { define i8 @icmp_sle_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.le.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0, %p[[P0]] -; CHECK: ret +; CHECK-LABEL: icmp_sle_i8( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.s8 %rs1, [icmp_sle_i8_param_0]; +; CHECK-NEXT: ld.param.s8 %rs2, [icmp_sle_i8_param_1]; +; CHECK-NEXT: setp.le.s16 %p1, %rs1, %rs2; +; CHECK-NEXT: 
selp.b32 %r1, 1, 0, %p1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; %cmp = icmp sle i8 %a, %b %ret = zext i1 %cmp to i8 ret i8 %ret diff --git a/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll b/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll index d1b478d..48209a8 100644 --- a/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll +++ b/llvm/test/CodeGen/NVPTX/convert-call-to-indirect.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 | %ptxas-verify -arch=sm_90 %} @@ -7,52 +8,203 @@ declare i64 @callee_variadic(ptr %p, ...); define %struct.64 @test_return_type_mismatch(ptr %p) { ; CHECK-LABEL: test_return_type_mismatch( -; CHECK: .param .align 1 .b8 retval0[8]; +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<40>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd2, [test_return_type_mismatch_param_0]; +; CHECK-NEXT: { // callseq 0, 0 +; CHECK-NEXT: .param .b64 param0; +; CHECK-NEXT: .param .align 1 .b8 retval0[8]; +; CHECK-NEXT: st.param.b64 [param0], %rd2; ; CHECK-NEXT: prototype_0 : .callprototype (.param .align 1 .b8 _[8]) _ (.param .b64 _); -; CHECK-NEXT: call (retval0), %rd{{[0-9]+}}, (param0), prototype_0; +; CHECK-NEXT: mov.b64 %rd1, callee; +; CHECK-NEXT: call (retval0), %rd1, (param0), prototype_0; +; CHECK-NEXT: ld.param.b8 %rd3, [retval0+7]; +; CHECK-NEXT: ld.param.b8 %rd4, [retval0+6]; +; CHECK-NEXT: ld.param.b8 %rd5, [retval0+5]; +; CHECK-NEXT: ld.param.b8 %rd6, [retval0+4]; +; CHECK-NEXT: ld.param.b8 %rd7, [retval0+3]; +; CHECK-NEXT: ld.param.b8 %rd8, [retval0+2]; +; CHECK-NEXT: ld.param.b8 %rd9, [retval0+1]; +; CHECK-NEXT: ld.param.b8 %rd10, [retval0]; +; CHECK-NEXT: } // callseq 0 +; CHECK-NEXT: shl.b64 %rd13, %rd9, 8; +; CHECK-NEXT: or.b64 %rd14, %rd13, %rd10; +; CHECK-NEXT: shl.b64 %rd16, %rd8, 16; +; CHECK-NEXT: shl.b64 %rd18, %rd7, 24; +; CHECK-NEXT: or.b64 %rd19, %rd18, %rd16; +; CHECK-NEXT: or.b64 %rd20, %rd19, %rd14; +; CHECK-NEXT: shl.b64 %rd23, %rd5, 8; +; CHECK-NEXT: or.b64 %rd24, %rd23, %rd6; +; CHECK-NEXT: shl.b64 %rd26, %rd4, 16; +; CHECK-NEXT: shl.b64 %rd28, %rd3, 24; +; CHECK-NEXT: or.b64 %rd29, %rd28, %rd26; +; CHECK-NEXT: or.b64 %rd30, %rd29, %rd24; +; CHECK-NEXT: shl.b64 %rd31, %rd30, 32; +; CHECK-NEXT: or.b64 %rd32, %rd31, %rd20; +; CHECK-NEXT: st.param.b8 [func_retval0], %rd10; +; CHECK-NEXT: shr.u64 %rd33, %rd32, 56; +; CHECK-NEXT: st.param.b8 [func_retval0+7], %rd33; +; CHECK-NEXT: shr.u64 %rd34, %rd32, 48; +; CHECK-NEXT: st.param.b8 [func_retval0+6], %rd34; +; CHECK-NEXT: shr.u64 %rd35, %rd32, 40; +; CHECK-NEXT: st.param.b8 [func_retval0+5], %rd35; +; CHECK-NEXT: shr.u64 %rd36, %rd32, 32; +; CHECK-NEXT: st.param.b8 [func_retval0+4], %rd36; +; CHECK-NEXT: shr.u64 %rd37, %rd32, 24; +; CHECK-NEXT: st.param.b8 [func_retval0+3], %rd37; +; CHECK-NEXT: shr.u64 %rd38, %rd32, 16; +; CHECK-NEXT: st.param.b8 [func_retval0+2], %rd38; +; CHECK-NEXT: shr.u64 %rd39, %rd32, 8; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rd39; +; CHECK-NEXT: ret; %ret = call %struct.64 @callee(ptr %p) ret %struct.64 %ret } define i64 @test_param_type_mismatch(ptr %p) { ; CHECK-LABEL: test_param_type_mismatch( -; CHECK: .param .b64 retval0; +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { // callseq 1, 0 +; CHECK-NEXT: .param .b64 param0; +; CHECK-NEXT: .param .b64 retval0; ; CHECK-NEXT: prototype_1 : 
.callprototype (.param .b64 _) _ (.param .b64 _); -; CHECK-NEXT: call (retval0), %rd{{[0-9]+}}, (param0), prototype_1; +; CHECK-NEXT: st.param.b64 [param0], 7; +; CHECK-NEXT: mov.b64 %rd1, callee; +; CHECK-NEXT: call (retval0), %rd1, (param0), prototype_1; +; CHECK-NEXT: ld.param.b64 %rd2, [retval0]; +; CHECK-NEXT: } // callseq 1 +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; +; CHECK-NEXT: ret; %ret = call i64 @callee(i64 7) ret i64 %ret } define i64 @test_param_count_mismatch(ptr %p) { ; CHECK-LABEL: test_param_count_mismatch( -; CHECK: .param .b64 retval0; +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd2, [test_param_count_mismatch_param_0]; +; CHECK-NEXT: { // callseq 2, 0 +; CHECK-NEXT: .param .b64 param0; +; CHECK-NEXT: .param .b64 param1; +; CHECK-NEXT: .param .b64 retval0; +; CHECK-NEXT: st.param.b64 [param0], %rd2; ; CHECK-NEXT: prototype_2 : .callprototype (.param .b64 _) _ (.param .b64 _, .param .b64 _); -; CHECK-NEXT: call (retval0), %rd{{[0-9]+}}, (param0, param1), prototype_2; +; CHECK-NEXT: st.param.b64 [param1], 7; +; CHECK-NEXT: mov.b64 %rd1, callee; +; CHECK-NEXT: call (retval0), %rd1, (param0, param1), prototype_2; +; CHECK-NEXT: ld.param.b64 %rd3, [retval0]; +; CHECK-NEXT: } // callseq 2 +; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: ret; %ret = call i64 @callee(ptr %p, i64 7) ret i64 %ret } define %struct.64 @test_return_type_mismatch_variadic(ptr %p) { ; CHECK-LABEL: test_return_type_mismatch_variadic( -; CHECK: .param .align 1 .b8 retval0[8]; +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<40>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd2, [test_return_type_mismatch_variadic_param_0]; +; CHECK-NEXT: { // callseq 3, 0 +; CHECK-NEXT: .param .b64 param0; +; CHECK-NEXT: .param .align 1 .b8 retval0[8]; +; CHECK-NEXT: st.param.b64 [param0], %rd2; ; CHECK-NEXT: prototype_3 : .callprototype (.param .align 1 .b8 _[8]) _ (.param .b64 _); -; CHECK-NEXT: call (retval0), %rd{{[0-9]+}}, (param0), prototype_3; +; CHECK-NEXT: mov.b64 %rd1, callee_variadic; +; CHECK-NEXT: call (retval0), %rd1, (param0), prototype_3; +; CHECK-NEXT: ld.param.b8 %rd3, [retval0+7]; +; CHECK-NEXT: ld.param.b8 %rd4, [retval0+6]; +; CHECK-NEXT: ld.param.b8 %rd5, [retval0+5]; +; CHECK-NEXT: ld.param.b8 %rd6, [retval0+4]; +; CHECK-NEXT: ld.param.b8 %rd7, [retval0+3]; +; CHECK-NEXT: ld.param.b8 %rd8, [retval0+2]; +; CHECK-NEXT: ld.param.b8 %rd9, [retval0+1]; +; CHECK-NEXT: ld.param.b8 %rd10, [retval0]; +; CHECK-NEXT: } // callseq 3 +; CHECK-NEXT: shl.b64 %rd13, %rd9, 8; +; CHECK-NEXT: or.b64 %rd14, %rd13, %rd10; +; CHECK-NEXT: shl.b64 %rd16, %rd8, 16; +; CHECK-NEXT: shl.b64 %rd18, %rd7, 24; +; CHECK-NEXT: or.b64 %rd19, %rd18, %rd16; +; CHECK-NEXT: or.b64 %rd20, %rd19, %rd14; +; CHECK-NEXT: shl.b64 %rd23, %rd5, 8; +; CHECK-NEXT: or.b64 %rd24, %rd23, %rd6; +; CHECK-NEXT: shl.b64 %rd26, %rd4, 16; +; CHECK-NEXT: shl.b64 %rd28, %rd3, 24; +; CHECK-NEXT: or.b64 %rd29, %rd28, %rd26; +; CHECK-NEXT: or.b64 %rd30, %rd29, %rd24; +; CHECK-NEXT: shl.b64 %rd31, %rd30, 32; +; CHECK-NEXT: or.b64 %rd32, %rd31, %rd20; +; CHECK-NEXT: st.param.b8 [func_retval0], %rd10; +; CHECK-NEXT: shr.u64 %rd33, %rd32, 56; +; CHECK-NEXT: st.param.b8 [func_retval0+7], %rd33; +; CHECK-NEXT: shr.u64 %rd34, %rd32, 48; +; CHECK-NEXT: st.param.b8 [func_retval0+6], %rd34; +; CHECK-NEXT: shr.u64 %rd35, %rd32, 40; +; CHECK-NEXT: st.param.b8 [func_retval0+5], %rd35; +; CHECK-NEXT: shr.u64 %rd36, %rd32, 32; +; CHECK-NEXT: st.param.b8 
[func_retval0+4], %rd36; +; CHECK-NEXT: shr.u64 %rd37, %rd32, 24; +; CHECK-NEXT: st.param.b8 [func_retval0+3], %rd37; +; CHECK-NEXT: shr.u64 %rd38, %rd32, 16; +; CHECK-NEXT: st.param.b8 [func_retval0+2], %rd38; +; CHECK-NEXT: shr.u64 %rd39, %rd32, 8; +; CHECK-NEXT: st.param.b8 [func_retval0+1], %rd39; +; CHECK-NEXT: ret; %ret = call %struct.64 (ptr, ...) @callee_variadic(ptr %p) ret %struct.64 %ret } define i64 @test_param_type_mismatch_variadic(ptr %p) { ; CHECK-LABEL: test_param_type_mismatch_variadic( -; CHECK: .param .b64 retval0; +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_param_type_mismatch_variadic_param_0]; +; CHECK-NEXT: { // callseq 4, 0 +; CHECK-NEXT: .param .align 8 .b8 param1[8]; +; CHECK-NEXT: .param .b64 param0; +; CHECK-NEXT: .param .b64 retval0; +; CHECK-NEXT: st.param.b64 [param0], %rd1; +; CHECK-NEXT: st.param.b64 [param1], 7; ; CHECK-NEXT: call.uni (retval0), callee_variadic, (param0, param1); +; CHECK-NEXT: ld.param.b64 %rd2, [retval0]; +; CHECK-NEXT: } // callseq 4 +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; +; CHECK-NEXT: ret; %ret = call i64 (ptr, ...) @callee_variadic(ptr %p, i64 7) ret i64 %ret } define i64 @test_param_count_mismatch_variadic(ptr %p) { ; CHECK-LABEL: test_param_count_mismatch_variadic( -; CHECK: .param .b64 retval0; +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_param_count_mismatch_variadic_param_0]; +; CHECK-NEXT: { // callseq 5, 0 +; CHECK-NEXT: .param .align 8 .b8 param1[8]; +; CHECK-NEXT: .param .b64 param0; +; CHECK-NEXT: .param .b64 retval0; +; CHECK-NEXT: st.param.b64 [param0], %rd1; +; CHECK-NEXT: st.param.b64 [param1], 7; ; CHECK-NEXT: call.uni (retval0), callee_variadic, (param0, param1); +; CHECK-NEXT: ld.param.b64 %rd2, [retval0]; +; CHECK-NEXT: } // callseq 5 +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; +; CHECK-NEXT: ret; %ret = call i64 (ptr, ...) 
@callee_variadic(ptr %p, i64 7) ret i64 %ret } diff --git a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll index 4d2ba7d..06fb8d2 100644 --- a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll +++ b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll @@ -22,8 +22,8 @@ define i32 @test_dynamic_stackalloc(i64 %n) { ; CHECK-32-NEXT: cvta.local.u32 %r5, %r4; ; CHECK-32-NEXT: { // callseq 0, 0 ; CHECK-32-NEXT: .param .b32 param0; -; CHECK-32-NEXT: st.param.b32 [param0], %r5; ; CHECK-32-NEXT: .param .b32 retval0; +; CHECK-32-NEXT: st.param.b32 [param0], %r5; ; CHECK-32-NEXT: call.uni (retval0), bar, (param0); ; CHECK-32-NEXT: ld.param.b32 %r6, [retval0]; ; CHECK-32-NEXT: } // callseq 0 @@ -43,8 +43,8 @@ define i32 @test_dynamic_stackalloc(i64 %n) { ; CHECK-64-NEXT: cvta.local.u64 %rd5, %rd4; ; CHECK-64-NEXT: { // callseq 0, 0 ; CHECK-64-NEXT: .param .b64 param0; -; CHECK-64-NEXT: st.param.b64 [param0], %rd5; ; CHECK-64-NEXT: .param .b32 retval0; +; CHECK-64-NEXT: st.param.b64 [param0], %rd5; ; CHECK-64-NEXT: call.uni (retval0), bar, (param0); ; CHECK-64-NEXT: ld.param.b32 %r1, [retval0]; ; CHECK-64-NEXT: } // callseq 0 diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll index 8918fbd..d4fcea3 100644 --- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll @@ -462,10 +462,10 @@ define <2 x half> @test_call(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NEXT: ld.param.b32 %r1, [test_call_param_0]; ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.b32 [param0], %r1; ; CHECK-NEXT: .param .align 4 .b8 param1[4]; -; CHECK-NEXT: st.param.b32 [param1], %r2; ; CHECK-NEXT: .param .align 4 .b8 retval0[4]; +; CHECK-NEXT: st.param.b32 [param1], %r2; +; CHECK-NEXT: st.param.b32 [param0], %r1; ; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1); ; CHECK-NEXT: ld.param.b32 %r3, [retval0]; ; CHECK-NEXT: } // callseq 0 @@ -485,10 +485,10 @@ define <2 x half> @test_call_flipped(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NEXT: ld.param.b32 %r1, [test_call_flipped_param_0]; ; CHECK-NEXT: { // callseq 1, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.b32 [param0], %r2; ; CHECK-NEXT: .param .align 4 .b8 param1[4]; -; CHECK-NEXT: st.param.b32 [param1], %r1; ; CHECK-NEXT: .param .align 4 .b8 retval0[4]; +; CHECK-NEXT: st.param.b32 [param1], %r1; +; CHECK-NEXT: st.param.b32 [param0], %r2; ; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1); ; CHECK-NEXT: ld.param.b32 %r3, [retval0]; ; CHECK-NEXT: } // callseq 1 @@ -508,10 +508,10 @@ define <2 x half> @test_tailcall_flipped(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NEXT: ld.param.b32 %r1, [test_tailcall_flipped_param_0]; ; CHECK-NEXT: { // callseq 2, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.b32 [param0], %r2; ; CHECK-NEXT: .param .align 4 .b8 param1[4]; -; CHECK-NEXT: st.param.b32 [param1], %r1; ; CHECK-NEXT: .param .align 4 .b8 retval0[4]; +; CHECK-NEXT: st.param.b32 [param1], %r1; +; CHECK-NEXT: st.param.b32 [param0], %r2; ; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1); ; CHECK-NEXT: ld.param.b32 %r3, [retval0]; ; CHECK-NEXT: } // callseq 2 diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll index 30afd69..b84a0ec 100644 --- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll @@ -859,10 
+859,10 @@ define <2 x float> @test_call(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: ld.param.b64 %rd1, [test_call_param_0]; ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; -; CHECK-NEXT: st.param.b64 [param0], %rd1; ; CHECK-NEXT: .param .align 8 .b8 param1[8]; -; CHECK-NEXT: st.param.b64 [param1], %rd2; ; CHECK-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-NEXT: st.param.b64 [param1], %rd2; +; CHECK-NEXT: st.param.b64 [param0], %rd1; ; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1); ; CHECK-NEXT: ld.param.b64 %rd3, [retval0]; ; CHECK-NEXT: } // callseq 0 @@ -882,10 +882,10 @@ define <2 x float> @test_call_flipped(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: ld.param.b64 %rd1, [test_call_flipped_param_0]; ; CHECK-NEXT: { // callseq 1, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; -; CHECK-NEXT: st.param.b64 [param0], %rd2; ; CHECK-NEXT: .param .align 8 .b8 param1[8]; -; CHECK-NEXT: st.param.b64 [param1], %rd1; ; CHECK-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-NEXT: st.param.b64 [param1], %rd1; +; CHECK-NEXT: st.param.b64 [param0], %rd2; ; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1); ; CHECK-NEXT: ld.param.b64 %rd3, [retval0]; ; CHECK-NEXT: } // callseq 1 @@ -905,10 +905,10 @@ define <2 x float> @test_tailcall_flipped(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: ld.param.b64 %rd1, [test_tailcall_flipped_param_0]; ; CHECK-NEXT: { // callseq 2, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; -; CHECK-NEXT: st.param.b64 [param0], %rd2; ; CHECK-NEXT: .param .align 8 .b8 param1[8]; -; CHECK-NEXT: st.param.b64 [param1], %rd1; ; CHECK-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-NEXT: st.param.b64 [param1], %rd1; +; CHECK-NEXT: st.param.b64 [param0], %rd2; ; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1); ; CHECK-NEXT: ld.param.b64 %rd3, [retval0]; ; CHECK-NEXT: } // callseq 2 diff --git a/llvm/test/CodeGen/NVPTX/fma.ll b/llvm/test/CodeGen/NVPTX/fma.ll index 5aa12b0..87274aa 100644 --- a/llvm/test/CodeGen/NVPTX/fma.ll +++ b/llvm/test/CodeGen/NVPTX/fma.ll @@ -36,10 +36,10 @@ define ptx_device float @t2_f32(float %x, float %y, float %z, float %w) { ; CHECK-NEXT: fma.rn.f32 %r6, %r1, %r2, %r5; ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .b32 param0; -; CHECK-NEXT: st.param.b32 [param0], %r4; ; CHECK-NEXT: .param .b32 param1; -; CHECK-NEXT: st.param.b32 [param1], %r6; ; CHECK-NEXT: .param .b32 retval0; +; CHECK-NEXT: st.param.b32 [param1], %r6; +; CHECK-NEXT: st.param.b32 [param0], %r4; ; CHECK-NEXT: call.uni (retval0), dummy_f32, (param0, param1); ; CHECK-NEXT: ld.param.b32 %r7, [retval0]; ; CHECK-NEXT: } // callseq 0 @@ -83,10 +83,10 @@ define ptx_device double @t2_f64(double %x, double %y, double %z, double %w) { ; CHECK-NEXT: fma.rn.f64 %rd6, %rd1, %rd2, %rd5; ; CHECK-NEXT: { // callseq 1, 0 ; CHECK-NEXT: .param .b64 param0; -; CHECK-NEXT: st.param.b64 [param0], %rd4; ; CHECK-NEXT: .param .b64 param1; -; CHECK-NEXT: st.param.b64 [param1], %rd6; ; CHECK-NEXT: .param .b64 retval0; +; CHECK-NEXT: st.param.b64 [param1], %rd6; +; CHECK-NEXT: st.param.b64 [param0], %rd4; ; CHECK-NEXT: call.uni (retval0), dummy_f64, (param0, param1); ; CHECK-NEXT: ld.param.b64 %rd7, [retval0]; ; CHECK-NEXT: } // callseq 1 diff --git a/llvm/test/CodeGen/NVPTX/forward-ld-param.ll b/llvm/test/CodeGen/NVPTX/forward-ld-param.ll index ed8f6b4..636e12b 100644 --- a/llvm/test/CodeGen/NVPTX/forward-ld-param.ll +++ b/llvm/test/CodeGen/NVPTX/forward-ld-param.ll @@ -64,9 +64,9 @@ define void @test_ld_param_byval(ptr 
byval(i32) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_ld_param_byval_param_0]; ; CHECK-NEXT: { // callseq 1, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; +; CHECK-NEXT: ld.param.b32 %r1, [test_ld_param_byval_param_0]; ; CHECK-NEXT: st.param.b32 [param0], %r1; ; CHECK-NEXT: call.uni byval_user, (param0); ; CHECK-NEXT: } // callseq 1 diff --git a/llvm/test/CodeGen/NVPTX/i128-param.ll b/llvm/test/CodeGen/NVPTX/i128-param.ll index 4f4c2fe..79abca0 100644 --- a/llvm/test/CodeGen/NVPTX/i128-param.ll +++ b/llvm/test/CodeGen/NVPTX/i128-param.ll @@ -29,11 +29,11 @@ start: ; CHECK-DAG: ld.param.v2.b64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [caller_kernel_param_1]; ; CHECK: { // callseq [[CALLSEQ_ID:[0-9]]], 0 - ; CHECK: .param .align 16 .b8 param0[16]; - ; CHECK-NEXT: st.param.v2.b64 [param0], {%[[REG0]], %[[REG1]]} - ; CHECK: .param .align 16 .b8 param1[16]; - ; CHECK-NEXT: st.param.v2.b64 [param1], {%[[REG2]], %[[REG3]]} - ; CHECK: } // callseq [[CALLSEQ_ID]] + ; CHECK-DAG: .param .align 16 .b8 param0[16]; + ; CHECK-DAG: .param .align 16 .b8 param1[16]; + ; CHECK-DAG: st.param.v2.b64 [param0], {%[[REG0]], %[[REG1]]} + ; CHECK-DAG: st.param.v2.b64 [param1], {%[[REG2]], %[[REG3]]} + ; CHECK: } // callseq [[CALLSEQ_ID]] call void @callee(i128 %0, i128 %1, ptr %2) ret void @@ -48,11 +48,11 @@ start: ; CHECK-DAG: ld.param.v2.b64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [caller_func_param_1] ; CHECK: { // callseq [[CALLSEQ_ID:[0-9]]], 0 - ; CHECK: .param .align 16 .b8 param0[16]; - ; CHECK: st.param.v2.b64 [param0], {%[[REG0]], %[[REG1]]} - ; CHECK: .param .align 16 .b8 param1[16]; - ; CHECK: st.param.v2.b64 [param1], {%[[REG2]], %[[REG3]]} - ; CHECK: } // callseq [[CALLSEQ_ID]] + ; CHECK-DAG: .param .align 16 .b8 param0[16]; + ; CHECK-DAG: .param .align 16 .b8 param1[16]; + ; CHECK-DAG: st.param.v2.b64 [param0], {%[[REG0]], %[[REG1]]} + ; CHECK-DAG: st.param.v2.b64 [param1], {%[[REG2]], %[[REG3]]} + ; CHECK: } // callseq [[CALLSEQ_ID]] call void @callee(i128 %0, i128 %1, ptr %2) ret void diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll index 2b7a06c..74136bb 100644 --- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll @@ -642,10 +642,10 @@ define <2 x i16> @test_call(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-NEXT: ld.param.b32 %r1, [test_call_param_0]; ; COMMON-NEXT: { // callseq 0, 0 ; COMMON-NEXT: .param .align 4 .b8 param0[4]; -; COMMON-NEXT: st.param.b32 [param0], %r1; ; COMMON-NEXT: .param .align 4 .b8 param1[4]; -; COMMON-NEXT: st.param.b32 [param1], %r2; ; COMMON-NEXT: .param .align 4 .b8 retval0[4]; +; COMMON-NEXT: st.param.b32 [param1], %r2; +; COMMON-NEXT: st.param.b32 [param0], %r1; ; COMMON-NEXT: call.uni (retval0), test_callee, (param0, param1); ; COMMON-NEXT: ld.param.b32 %r3, [retval0]; ; COMMON-NEXT: } // callseq 0 @@ -665,10 +665,10 @@ define <2 x i16> @test_call_flipped(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-NEXT: ld.param.b32 %r1, [test_call_flipped_param_0]; ; COMMON-NEXT: { // callseq 1, 0 ; COMMON-NEXT: .param .align 4 .b8 param0[4]; -; COMMON-NEXT: st.param.b32 [param0], %r2; ; COMMON-NEXT: .param .align 4 .b8 param1[4]; -; COMMON-NEXT: st.param.b32 [param1], %r1; ; COMMON-NEXT: .param .align 4 .b8 retval0[4]; +; COMMON-NEXT: st.param.b32 [param1], %r1; +; COMMON-NEXT: st.param.b32 [param0], %r2; ; COMMON-NEXT: call.uni (retval0), test_callee, (param0, param1); ; COMMON-NEXT: ld.param.b32 %r3, 
[retval0]; ; COMMON-NEXT: } // callseq 1 @@ -688,10 +688,10 @@ define <2 x i16> @test_tailcall_flipped(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-NEXT: ld.param.b32 %r1, [test_tailcall_flipped_param_0]; ; COMMON-NEXT: { // callseq 2, 0 ; COMMON-NEXT: .param .align 4 .b8 param0[4]; -; COMMON-NEXT: st.param.b32 [param0], %r2; ; COMMON-NEXT: .param .align 4 .b8 param1[4]; -; COMMON-NEXT: st.param.b32 [param1], %r1; ; COMMON-NEXT: .param .align 4 .b8 retval0[4]; +; COMMON-NEXT: st.param.b32 [param1], %r1; +; COMMON-NEXT: st.param.b32 [param0], %r2; ; COMMON-NEXT: call.uni (retval0), test_callee, (param0, param1); ; COMMON-NEXT: ld.param.b32 %r3, [retval0]; ; COMMON-NEXT: } // callseq 2 diff --git a/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll index 3edd4e4..98f94bb 100644 --- a/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll @@ -1,42 +1,107 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80 \ -; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ -; RUN: | FileCheck %s -; RUN: %if ptxas %{ \ -; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -asm-verbose=false \ -; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ -; RUN: | %ptxas-verify -arch=sm_90 \ +; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx80 -disable-post-ra -frame-pointer=all \ +; RUN: -verify-machineinstrs -O0 | FileCheck %s --check-prefixes=O0,COMMON +; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx80 -disable-post-ra -frame-pointer=all \ +; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=O3,COMMON +; RUN: %if ptxas %{ \ +; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx80 -disable-post-ra -frame-pointer=all \ +; RUN: -verify-machineinstrs -O0 \ +; RUN: | %ptxas-verify -arch=sm_90 \ +; RUN: %} +; RUN: %if ptxas %{ \ +; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx80 -disable-post-ra -frame-pointer=all \ +; RUN: -verify-machineinstrs \ +; RUN: | %ptxas-verify -arch=sm_90 \ ; RUN: %} +target triple = "nvptx64-nvidia-cuda" target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" define i16 @test_bitcast_2xi8_i16(<2 x i8> %a) { -; CHECK-LABEL: test_bitcast_2xi8_i16( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<5>; -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [test_bitcast_2xi8_i16_param_0]; -; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2}; -; CHECK-NEXT: shl.b16 %rs3, %rs2, 8; -; CHECK-NEXT: or.b16 %rs4, %rs1, %rs3; -; CHECK-NEXT: cvt.u32.u16 %r2, %rs4; -; CHECK-NEXT: st.param.b32 [func_retval0], %r2; -; CHECK-NEXT: ret; +; O0-LABEL: test_bitcast_2xi8_i16( +; O0: { +; O0-NEXT: .reg .b16 %rs<5>; +; O0-NEXT: .reg .b32 %r<3>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [test_bitcast_2xi8_i16_param_0]; +; O0-NEXT: mov.b32 %r1, {%rs1, %rs2}; +; O0-NEXT: shl.b16 %rs3, %rs2, 8; +; O0-NEXT: or.b16 %rs4, %rs1, %rs3; +; O0-NEXT: cvt.u32.u16 %r2, %rs4; +; O0-NEXT: st.param.b32 [func_retval0], %r2; +; O0-NEXT: ret; +; +; O3-LABEL: test_bitcast_2xi8_i16( +; O3: { +; O3-NEXT: .reg .b32 %r<2>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b16 %r1, [test_bitcast_2xi8_i16_param_0]; +; O3-NEXT: st.param.b32 [func_retval0], %r1; +; O3-NEXT: ret; %res = bitcast <2 x i8> %a to i16 ret i16 %res } define <2 x i8> @test_bitcast_i16_2xi8(i16 %a) { -; CHECK-LABEL: test_bitcast_i16_2xi8( -; CHECK: { -; CHECK-NEXT: .reg .b16 
%rs<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b16 %rs1, [test_bitcast_i16_2xi8_param_0]; -; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; -; CHECK-NEXT: ret; +; O0-LABEL: test_bitcast_i16_2xi8( +; O0: { +; O0-NEXT: .reg .b16 %rs<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b16 %rs1, [test_bitcast_i16_2xi8_param_0]; +; O0-NEXT: st.param.b16 [func_retval0], %rs1; +; O0-NEXT: ret; +; +; O3-LABEL: test_bitcast_i16_2xi8( +; O3: { +; O3-NEXT: .reg .b16 %rs<2>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b16 %rs1, [test_bitcast_i16_2xi8_param_0]; +; O3-NEXT: st.param.b16 [func_retval0], %rs1; +; O3-NEXT: ret; %res = bitcast i16 %a to <2 x i8> ret <2 x i8> %res } + +define <2 x i8> @test_call_2xi8(<2 x i8> %a) { +; O0-LABEL: test_call_2xi8( +; O0: { +; O0-NEXT: .reg .b16 %rs<7>; +; O0-NEXT: .reg .b32 %r<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [test_call_2xi8_param_0]; +; O0-NEXT: mov.b32 %r1, {%rs1, %rs2}; +; O0-NEXT: { // callseq 0, 0 +; O0-NEXT: .param .align 2 .b8 param0[2]; +; O0-NEXT: .param .align 2 .b8 retval0[2]; +; O0-NEXT: st.param.v2.b8 [param0], {%rs1, %rs2}; +; O0-NEXT: call.uni (retval0), test_call_2xi8, (param0); +; O0-NEXT: ld.param.v2.b8 {%rs3, %rs4}, [retval0]; +; O0-NEXT: } // callseq 0 +; O0-NEXT: st.param.v2.b8 [func_retval0], {%rs3, %rs4}; +; O0-NEXT: ret; +; +; O3-LABEL: test_call_2xi8( +; O3: { +; O3-NEXT: .reg .b16 %rs<7>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [test_call_2xi8_param_0]; +; O3-NEXT: { // callseq 0, 0 +; O3-NEXT: .param .align 2 .b8 param0[2]; +; O3-NEXT: .param .align 2 .b8 retval0[2]; +; O3-NEXT: st.param.v2.b8 [param0], {%rs1, %rs2}; +; O3-NEXT: call.uni (retval0), test_call_2xi8, (param0); +; O3-NEXT: ld.param.v2.b8 {%rs3, %rs4}, [retval0]; +; O3-NEXT: } // callseq 0 +; O3-NEXT: st.param.v2.b8 [func_retval0], {%rs3, %rs4}; +; O3-NEXT: ret; + %res = call <2 x i8> @test_call_2xi8(<2 x i8> %a) + ret <2 x i8> %res +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; COMMON: {{.*}} diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll index da99cec..06c2cc8 100644 --- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll @@ -1273,10 +1273,10 @@ define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 { ; O0-NEXT: ld.param.b32 %r1, [test_call_param_0]; ; O0-NEXT: { // callseq 0, 0 ; O0-NEXT: .param .align 4 .b8 param0[4]; -; O0-NEXT: st.param.b32 [param0], %r1; ; O0-NEXT: .param .align 4 .b8 param1[4]; -; O0-NEXT: st.param.b32 [param1], %r2; ; O0-NEXT: .param .align 4 .b8 retval0[4]; +; O0-NEXT: st.param.b32 [param1], %r2; +; O0-NEXT: st.param.b32 [param0], %r1; ; O0-NEXT: call.uni (retval0), test_callee, (param0, param1); ; O0-NEXT: ld.param.b32 %r3, [retval0]; ; O0-NEXT: } // callseq 0 @@ -1289,13 +1289,13 @@ define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 { ; O3-EMPTY: ; O3-NEXT: // %bb.0: ; O3-NEXT: ld.param.b32 %r1, [test_call_param_0]; -; O3-NEXT: ld.param.b32 %r2, [test_call_param_1]; ; O3-NEXT: { // callseq 0, 0 ; O3-NEXT: .param .align 4 .b8 param0[4]; -; O3-NEXT: st.param.b32 [param0], %r1; ; O3-NEXT: .param .align 4 .b8 param1[4]; -; O3-NEXT: st.param.b32 [param1], %r2; ; O3-NEXT: .param .align 4 .b8 retval0[4]; +; O3-NEXT: ld.param.b32 %r2, [test_call_param_1]; +; O3-NEXT: st.param.b32 [param1], %r2; +; O3-NEXT: st.param.b32 [param0], %r1; ; O3-NEXT: call.uni (retval0), test_callee, (param0, param1); ; O3-NEXT: ld.param.b32 %r3, [retval0]; ; O3-NEXT: } // callseq 0 @@ -1315,10 +1315,10 @@ define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 { ; O0-NEXT: ld.param.b32 %r1, [test_call_flipped_param_0]; ; O0-NEXT: { // callseq 1, 0 ; O0-NEXT: .param .align 4 .b8 param0[4]; -; O0-NEXT: st.param.b32 [param0], %r2; ; O0-NEXT: .param .align 4 .b8 param1[4]; -; O0-NEXT: st.param.b32 [param1], %r1; ; O0-NEXT: .param .align 4 .b8 retval0[4]; +; O0-NEXT: st.param.b32 [param1], %r1; +; O0-NEXT: st.param.b32 [param0], %r2; ; O0-NEXT: call.uni (retval0), test_callee, (param0, param1); ; O0-NEXT: ld.param.b32 %r3, [retval0]; ; O0-NEXT: } // callseq 1 @@ -1331,13 +1331,13 @@ define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 { ; O3-EMPTY: ; O3-NEXT: // %bb.0: ; O3-NEXT: ld.param.b32 %r1, [test_call_flipped_param_0]; -; O3-NEXT: ld.param.b32 %r2, [test_call_flipped_param_1]; ; O3-NEXT: { // callseq 1, 0 ; O3-NEXT: .param .align 4 .b8 param0[4]; -; O3-NEXT: st.param.b32 [param0], %r2; ; O3-NEXT: .param .align 4 .b8 param1[4]; -; O3-NEXT: st.param.b32 [param1], %r1; ; O3-NEXT: .param .align 4 .b8 retval0[4]; +; O3-NEXT: st.param.b32 [param1], %r1; +; O3-NEXT: ld.param.b32 %r2, [test_call_flipped_param_1]; +; O3-NEXT: st.param.b32 [param0], %r2; ; O3-NEXT: call.uni (retval0), test_callee, (param0, param1); ; O3-NEXT: ld.param.b32 %r3, [retval0]; ; O3-NEXT: } // callseq 1 @@ -1357,10 +1357,10 @@ define <4 x i8> @test_tailcall_flipped(<4 x i8> %a, <4 x i8> %b) #0 { ; O0-NEXT: ld.param.b32 %r1, [test_tailcall_flipped_param_0]; ; O0-NEXT: { // callseq 2, 0 ; O0-NEXT: .param .align 4 .b8 param0[4]; -; O0-NEXT: st.param.b32 [param0], %r2; ; O0-NEXT: .param .align 4 .b8 param1[4]; -; O0-NEXT: st.param.b32 [param1], %r1; ; O0-NEXT: .param .align 4 .b8 retval0[4]; +; O0-NEXT: st.param.b32 [param1], %r1; +; O0-NEXT: st.param.b32 [param0], %r2; ; O0-NEXT: call.uni (retval0), test_callee, (param0, param1); ; O0-NEXT: ld.param.b32 %r3, [retval0]; ; O0-NEXT: } // callseq 2 @@ -1373,13 +1373,13 @@ define <4 x i8> 
@test_tailcall_flipped(<4 x i8> %a, <4 x i8> %b) #0 { ; O3-EMPTY: ; O3-NEXT: // %bb.0: ; O3-NEXT: ld.param.b32 %r1, [test_tailcall_flipped_param_0]; -; O3-NEXT: ld.param.b32 %r2, [test_tailcall_flipped_param_1]; ; O3-NEXT: { // callseq 2, 0 ; O3-NEXT: .param .align 4 .b8 param0[4]; -; O3-NEXT: st.param.b32 [param0], %r2; ; O3-NEXT: .param .align 4 .b8 param1[4]; -; O3-NEXT: st.param.b32 [param1], %r1; ; O3-NEXT: .param .align 4 .b8 retval0[4]; +; O3-NEXT: st.param.b32 [param1], %r1; +; O3-NEXT: ld.param.b32 %r2, [test_tailcall_flipped_param_1]; +; O3-NEXT: st.param.b32 [param0], %r2; ; O3-NEXT: call.uni (retval0), test_callee, (param0, param1); ; O3-NEXT: ld.param.b32 %r3, [retval0]; ; O3-NEXT: } // callseq 2 diff --git a/llvm/test/CodeGen/NVPTX/idioms.ll b/llvm/test/CodeGen/NVPTX/idioms.ll index be84f9b..a3bf892 100644 --- a/llvm/test/CodeGen/NVPTX/idioms.ll +++ b/llvm/test/CodeGen/NVPTX/idioms.ll @@ -173,8 +173,8 @@ define %struct.S16 @i32_to_2xi16_shr(i32 noundef %i){ ; CHECK-NEXT: } // callseq 0 ; CHECK-NEXT: shr.s32 %r2, %r1, 16; ; CHECK-NEXT: shr.u32 %r3, %r2, 16; -; CHECK-NEXT: st.param.b16 [func_retval0], %r2; ; CHECK-NEXT: st.param.b16 [func_retval0+2], %r3; +; CHECK-NEXT: st.param.b16 [func_retval0], %r2; ; CHECK-NEXT: ret; call void @escape_int(i32 %i); // Force %i to be loaded completely. %i1 = ashr i32 %i, 16 diff --git a/llvm/test/CodeGen/NVPTX/indirect_byval.ll b/llvm/test/CodeGen/NVPTX/indirect_byval.ll index eae0321..782e672 100644 --- a/llvm/test/CodeGen/NVPTX/indirect_byval.ll +++ b/llvm/test/CodeGen/NVPTX/indirect_byval.ll @@ -23,15 +23,15 @@ define internal i32 @foo() { ; CHECK-NEXT: mov.b64 %SPL, __local_depot0; ; CHECK-NEXT: cvta.local.u64 %SP, %SPL; ; CHECK-NEXT: ld.global.b64 %rd1, [ptr]; -; CHECK-NEXT: add.u64 %rd3, %SPL, 1; -; CHECK-NEXT: ld.local.b8 %rs1, [%rd3]; -; CHECK-NEXT: add.u64 %rd4, %SP, 0; ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .align 1 .b8 param0[1]; -; CHECK-NEXT: st.param.b8 [param0], %rs1; ; CHECK-NEXT: .param .b64 param1; -; CHECK-NEXT: st.param.b64 [param1], %rd4; ; CHECK-NEXT: .param .b32 retval0; +; CHECK-NEXT: add.u64 %rd2, %SP, 0; +; CHECK-NEXT: st.param.b64 [param1], %rd2; +; CHECK-NEXT: add.u64 %rd4, %SPL, 1; +; CHECK-NEXT: ld.local.b8 %rs1, [%rd4]; +; CHECK-NEXT: st.param.b8 [param0], %rs1; ; CHECK-NEXT: prototype_0 : .callprototype (.param .b32 _) _ (.param .align 1 .b8 _[1], .param .b64 _); ; CHECK-NEXT: call (retval0), %rd1, (param0, param1), prototype_0; ; CHECK-NEXT: ld.param.b32 %r1, [retval0]; @@ -60,15 +60,15 @@ define internal i32 @bar() { ; CHECK-NEXT: mov.b64 %SPL, __local_depot1; ; CHECK-NEXT: cvta.local.u64 %SP, %SPL; ; CHECK-NEXT: ld.global.b64 %rd1, [ptr]; -; CHECK-NEXT: add.u64 %rd3, %SPL, 8; -; CHECK-NEXT: ld.local.b64 %rd4, [%rd3]; -; CHECK-NEXT: add.u64 %rd5, %SP, 0; ; CHECK-NEXT: { // callseq 1, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; -; CHECK-NEXT: st.param.b64 [param0], %rd4; ; CHECK-NEXT: .param .b64 param1; -; CHECK-NEXT: st.param.b64 [param1], %rd5; ; CHECK-NEXT: .param .b32 retval0; +; CHECK-NEXT: add.u64 %rd2, %SP, 0; +; CHECK-NEXT: st.param.b64 [param1], %rd2; +; CHECK-NEXT: add.u64 %rd4, %SPL, 8; +; CHECK-NEXT: ld.local.b64 %rd5, [%rd4]; +; CHECK-NEXT: st.param.b64 [param0], %rd5; ; CHECK-NEXT: prototype_1 : .callprototype (.param .b32 _) _ (.param .align 8 .b8 _[8], .param .b64 _); ; CHECK-NEXT: call (retval0), %rd1, (param0, param1), prototype_1; ; CHECK-NEXT: ld.param.b32 %r1, [retval0]; diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll 
b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll index 321a624..38185c7b 100644 --- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll +++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll @@ -121,20 +121,18 @@ define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, p define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 %input) { ; PTX-LABEL: grid_const_escape( ; PTX: { -; PTX-NEXT: .reg .b32 %r<2>; ; PTX-NEXT: .reg .b64 %rd<4>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: mov.b64 %rd2, grid_const_escape_param_0; ; PTX-NEXT: cvta.param.u64 %rd3, %rd2; -; PTX-NEXT: mov.b64 %rd1, escape; ; PTX-NEXT: { // callseq 0, 0 ; PTX-NEXT: .param .b64 param0; -; PTX-NEXT: st.param.b64 [param0], %rd3; ; PTX-NEXT: .param .b32 retval0; +; PTX-NEXT: st.param.b64 [param0], %rd3; ; PTX-NEXT: prototype_0 : .callprototype (.param .b32 _) _ (.param .b64 _); +; PTX-NEXT: mov.b64 %rd1, escape; ; PTX-NEXT: call (retval0), %rd1, (param0), prototype_0; -; PTX-NEXT: ld.param.b32 %r1, [retval0]; ; PTX-NEXT: } // callseq 0 ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_escape( @@ -153,7 +151,7 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 ; PTX-NEXT: .local .align 4 .b8 __local_depot4[4]; ; PTX-NEXT: .reg .b64 %SP; ; PTX-NEXT: .reg .b64 %SPL; -; PTX-NEXT: .reg .b32 %r<3>; +; PTX-NEXT: .reg .b32 %r<2>; ; PTX-NEXT: .reg .b64 %rd<8>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: @@ -167,18 +165,17 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 ; PTX-NEXT: add.u64 %rd6, %SP, 0; ; PTX-NEXT: add.u64 %rd7, %SPL, 0; ; PTX-NEXT: st.local.b32 [%rd7], %r1; -; PTX-NEXT: mov.b64 %rd1, escape3; ; PTX-NEXT: { // callseq 1, 0 ; PTX-NEXT: .param .b64 param0; -; PTX-NEXT: st.param.b64 [param0], %rd5; ; PTX-NEXT: .param .b64 param1; -; PTX-NEXT: st.param.b64 [param1], %rd6; ; PTX-NEXT: .param .b64 param2; -; PTX-NEXT: st.param.b64 [param2], %rd4; ; PTX-NEXT: .param .b32 retval0; +; PTX-NEXT: st.param.b64 [param2], %rd4; +; PTX-NEXT: st.param.b64 [param1], %rd6; +; PTX-NEXT: st.param.b64 [param0], %rd5; ; PTX-NEXT: prototype_1 : .callprototype (.param .b32 _) _ (.param .b64 _, .param .b64 _, .param .b64 _); +; PTX-NEXT: mov.b64 %rd1, escape3; ; PTX-NEXT: call (retval0), %rd1, (param0, param1, param2), prototype_1; -; PTX-NEXT: ld.param.b32 %r2, [retval0]; ; PTX-NEXT: } // callseq 1 ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @multiple_grid_const_escape( @@ -255,7 +252,7 @@ define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) { ; PTX-LABEL: grid_const_partial_escape( ; PTX: { -; PTX-NEXT: .reg .b32 %r<4>; +; PTX-NEXT: .reg .b32 %r<3>; ; PTX-NEXT: .reg .b64 %rd<6>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: @@ -266,14 +263,13 @@ define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %ou ; PTX-NEXT: ld.param.b32 %r1, [grid_const_partial_escape_param_0]; ; PTX-NEXT: add.s32 %r2, %r1, %r1; ; PTX-NEXT: st.global.b32 [%rd4], %r2; -; PTX-NEXT: mov.b64 %rd1, escape; ; PTX-NEXT: { // callseq 2, 0 ; PTX-NEXT: .param .b64 param0; -; PTX-NEXT: st.param.b64 [param0], %rd5; ; PTX-NEXT: .param .b32 retval0; +; PTX-NEXT: st.param.b64 [param0], %rd5; ; PTX-NEXT: prototype_2 : .callprototype (.param .b32 _) _ (.param .b64 _); +; PTX-NEXT: mov.b64 %rd1, escape; ; PTX-NEXT: call (retval0), %rd1, (param0), prototype_2; -; PTX-NEXT: ld.param.b32 %r3, [retval0]; ; PTX-NEXT: } // callseq 2 ; 
PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_partial_escape( @@ -295,7 +291,7 @@ define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %ou define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %output) { ; PTX-LABEL: grid_const_partial_escapemem( ; PTX: { -; PTX-NEXT: .reg .b32 %r<5>; +; PTX-NEXT: .reg .b32 %r<4>; ; PTX-NEXT: .reg .b64 %rd<6>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: @@ -307,14 +303,13 @@ define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ; PTX-NEXT: ld.param.b32 %r2, [grid_const_partial_escapemem_param_0+4]; ; PTX-NEXT: st.global.b64 [%rd4], %rd5; ; PTX-NEXT: add.s32 %r3, %r1, %r2; -; PTX-NEXT: mov.b64 %rd1, escape; ; PTX-NEXT: { // callseq 3, 0 ; PTX-NEXT: .param .b64 param0; -; PTX-NEXT: st.param.b64 [param0], %rd5; ; PTX-NEXT: .param .b32 retval0; +; PTX-NEXT: st.param.b64 [param0], %rd5; ; PTX-NEXT: prototype_3 : .callprototype (.param .b32 _) _ (.param .b64 _); +; PTX-NEXT: mov.b64 %rd1, escape; ; PTX-NEXT: call (retval0), %rd1, (param0), prototype_3; -; PTX-NEXT: ld.param.b32 %r4, [retval0]; ; PTX-NEXT: } // callseq 3 ; PTX-NEXT: st.param.b32 [func_retval0], %r3; ; PTX-NEXT: ret; @@ -535,9 +530,9 @@ define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) { ; PTX-NEXT: .reg .b32 %r<2>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.b32 %r1, [test_forward_byval_arg_param_0]; ; PTX-NEXT: { // callseq 4, 0 ; PTX-NEXT: .param .align 4 .b8 param0[4]; +; PTX-NEXT: ld.param.b32 %r1, [test_forward_byval_arg_param_0]; ; PTX-NEXT: st.param.b32 [param0], %r1; ; PTX-NEXT: call.uni device_func, (param0); ; PTX-NEXT: } // callseq 4 diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll index c165de7..7c029ab 100644 --- a/llvm/test/CodeGen/NVPTX/lower-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-args.ll @@ -31,7 +31,7 @@ define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 % ; PTX-LABEL: load_alignment( ; PTX: { ; PTX-NEXT: .reg .b32 %r<4>; -; PTX-NEXT: .reg .b64 %rd<7>; +; PTX-NEXT: .reg .b64 %rd<6>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: // %entry ; PTX-NEXT: mov.b64 %rd1, load_alignment_param_0; @@ -45,10 +45,9 @@ define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 % ; PTX-NEXT: st.b32 [%rd3], %r3; ; PTX-NEXT: { // callseq 0, 0 ; PTX-NEXT: .param .b64 param0; -; PTX-NEXT: st.param.b64 [param0], %rd5; ; PTX-NEXT: .param .b64 retval0; +; PTX-NEXT: st.param.b64 [param0], %rd5; ; PTX-NEXT: call.uni (retval0), escape, (param0); -; PTX-NEXT: ld.param.b64 %rd6, [retval0]; ; PTX-NEXT: } // callseq 0 ; PTX-NEXT: ret; entry: @@ -76,17 +75,16 @@ define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) { ; ; PTX-LABEL: load_padding( ; PTX: { -; PTX-NEXT: .reg .b64 %rd<4>; +; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: mov.b64 %rd1, load_padding_param_0; ; PTX-NEXT: cvta.local.u64 %rd2, %rd1; ; PTX-NEXT: { // callseq 1, 0 ; PTX-NEXT: .param .b64 param0; -; PTX-NEXT: st.param.b64 [param0], %rd2; ; PTX-NEXT: .param .b64 retval0; +; PTX-NEXT: st.param.b64 [param0], %rd2; ; PTX-NEXT: call.uni (retval0), escape, (param0); -; PTX-NEXT: ld.param.b64 %rd3, [retval0]; ; PTX-NEXT: } // callseq 1 ; PTX-NEXT: ret; %tmp = call ptr @escape(ptr nonnull align 16 %arg) diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll index 4784d70..20a3519 100644 --- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll 
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll @@ -911,9 +911,9 @@ define void @device_func(ptr byval(i32) align 4 %input) { ; PTX-NEXT: .reg .b64 %rd<2>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.b32 %r1, [device_func_param_0]; ; PTX-NEXT: { // callseq 3, 0 ; PTX-NEXT: .param .align 4 .b8 param0[4]; +; PTX-NEXT: ld.param.b32 %r1, [device_func_param_0]; ; PTX-NEXT: st.param.b32 [param0], %r1; ; PTX-NEXT: call.uni device_func, (param0); ; PTX-NEXT: } // callseq 3 diff --git a/llvm/test/CodeGen/NVPTX/misched_func_call.ll b/llvm/test/CodeGen/NVPTX/misched_func_call.ll index 8401f45..b2994c0 100644 --- a/llvm/test/CodeGen/NVPTX/misched_func_call.ll +++ b/llvm/test/CodeGen/NVPTX/misched_func_call.ll @@ -8,7 +8,7 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) { ; CHECK-LABEL: wombat( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<11>; -; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %bb ; CHECK-NEXT: ld.param.b32 %r4, [wombat_param_2]; @@ -19,19 +19,18 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) { ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .b64 param0; -; CHECK-NEXT: st.param.b64 [param0], 0d0000000000000000; ; CHECK-NEXT: .param .b64 retval0; +; CHECK-NEXT: st.param.b64 [param0], 0; ; CHECK-NEXT: call.uni (retval0), quux, (param0); -; CHECK-NEXT: ld.param.b64 %rd1, [retval0]; ; CHECK-NEXT: } // callseq 0 ; CHECK-NEXT: mul.lo.s32 %r7, %r10, %r3; ; CHECK-NEXT: or.b32 %r8, %r4, %r7; ; CHECK-NEXT: mul.lo.s32 %r9, %r2, %r8; -; CHECK-NEXT: cvt.rn.f64.s32 %rd2, %r9; -; CHECK-NEXT: cvt.rn.f64.u32 %rd3, %r10; -; CHECK-NEXT: add.rn.f64 %rd4, %rd3, %rd2; -; CHECK-NEXT: mov.b64 %rd5, 0; -; CHECK-NEXT: st.global.b64 [%rd5], %rd4; +; CHECK-NEXT: cvt.rn.f64.s32 %rd1, %r9; +; CHECK-NEXT: cvt.rn.f64.u32 %rd2, %r10; +; CHECK-NEXT: add.rn.f64 %rd3, %rd2, %rd1; +; CHECK-NEXT: mov.b64 %rd4, 0; +; CHECK-NEXT: st.global.b64 [%rd4], %rd3; ; CHECK-NEXT: mov.b32 %r10, 1; ; CHECK-NEXT: bra.uni $L__BB0_1; bb: diff --git a/llvm/test/CodeGen/NVPTX/param-add.ll b/llvm/test/CodeGen/NVPTX/param-add.ll index 4fa1235..c5ea9f8 100644 --- a/llvm/test/CodeGen/NVPTX/param-add.ll +++ b/llvm/test/CodeGen/NVPTX/param-add.ll @@ -18,16 +18,16 @@ define i32 @test(%struct.1float alignstack(32) %data) { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [test_param_0]; -; CHECK-NEXT: shr.u32 %r2, %r1, 8; -; CHECK-NEXT: shr.u32 %r3, %r1, 16; -; CHECK-NEXT: shr.u32 %r4, %r1, 24; ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .align 1 .b8 param0[4]; +; CHECK-NEXT: .param .b32 retval0; ; CHECK-NEXT: st.param.b8 [param0], %r1; +; CHECK-NEXT: shr.u32 %r2, %r1, 8; ; CHECK-NEXT: st.param.b8 [param0+1], %r2; +; CHECK-NEXT: shr.u32 %r3, %r1, 16; ; CHECK-NEXT: st.param.b8 [param0+2], %r3; +; CHECK-NEXT: shr.u32 %r4, %r3, 8; ; CHECK-NEXT: st.param.b8 [param0+3], %r4; -; CHECK-NEXT: .param .b32 retval0; ; CHECK-NEXT: call.uni (retval0), callee, (param0); ; CHECK-NEXT: ld.param.b32 %r5, [retval0]; ; CHECK-NEXT: } // callseq 0 diff --git a/llvm/test/CodeGen/NVPTX/param-load-store.ll b/llvm/test/CodeGen/NVPTX/param-load-store.ll index 6c52bfd..db3fbbc 100644 --- a/llvm/test/CodeGen/NVPTX/param-load-store.ll +++ b/llvm/test/CodeGen/NVPTX/param-load-store.ll @@ -27,10 +27,10 @@ ; CHECK: ld.param.b8 [[A8:%rs[0-9]+]], [test_i1_param_0]; ; CHECK: and.b16 [[A:%rs[0-9]+]], [[A8]], 1; ; CHECK: setp.ne.b16 %p1, [[A]], 0 +; CHECK-DAG: .param .b32 param0; +; CHECK-DAG: 
.param .b32 retval0; ; CHECK: cvt.u32.u16 [[B:%r[0-9]+]], [[A8]] -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0], [[B]] -; CHECK: .param .b32 retval0; +; CHECK-DAG: st.param.b32 [param0], [[B]] ; CHECK: call.uni (retval0), test_i1, ; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0]; ; CHECK: st.param.b32 [func_retval0], [[R8]]; @@ -47,11 +47,11 @@ define i1 @test_i1(i1 %a) { ; CHECK-NEXT: .param .b32 test_i1s_param_0 ; CHECK: ld.param.b8 [[A8:%rs[0-9]+]], [test_i1s_param_0]; ; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]]; +; CHECK: .param .b32 param0; +; CHECK: .param .b32 retval0; ; CHECK: and.b32 [[A1:%r[0-9]+]], [[A32]], 1; ; CHECK: neg.s32 [[A:%r[0-9]+]], [[A1]]; -; CHECK: .param .b32 param0; ; CHECK: st.param.b32 [param0], [[A]]; -; CHECK: .param .b32 retval0; ; CHECK: call.uni ; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0]; ; CHECK: and.b32 [[R1:%r[0-9]+]], [[R8]], 1; @@ -70,9 +70,9 @@ define signext i1 @test_i1s(i1 signext %a) { ; CHECK-DAG: ld.param.b8 [[E2:%rs[0-9]+]], [test_v3i1_param_0+2]; ; CHECK-DAG: ld.param.b8 [[E0:%rs[0-9]+]], [test_v3i1_param_0] ; CHECK: .param .align 1 .b8 param0[1]; +; CHECK: .param .align 1 .b8 retval0[1]; ; CHECK-DAG: st.param.b8 [param0], [[E0]]; ; CHECK-DAG: st.param.b8 [param0+2], [[E2]]; -; CHECK: .param .align 1 .b8 retval0[1]; ; CHECK: call.uni (retval0), test_v3i1, ; CHECK-DAG: ld.param.b8 [[RE0:%rs[0-9]+]], [retval0]; ; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2]; @@ -89,8 +89,8 @@ define <3 x i1> @test_v3i1(<3 x i1> %a) { ; CHECK-NEXT: .param .align 1 .b8 test_v4i1_param_0[1] ; CHECK: ld.param.b8 [[E0:%rs[0-9]+]], [test_v4i1_param_0] ; CHECK: .param .align 1 .b8 param0[1]; -; CHECK: st.param.b8 [param0], [[E0]]; ; CHECK: .param .align 1 .b8 retval0[1]; +; CHECK: st.param.b8 [param0], [[E0]]; ; CHECK: call.uni (retval0), test_v4i1, ; CHECK: ld.param.b8 [[RE0:%rs[0-9]+]], [retval0]; ; CHECK: ld.param.b8 [[RE1:%rs[0-9]+]], [retval0+1]; @@ -112,9 +112,9 @@ define <4 x i1> @test_v4i1(<4 x i1> %a) { ; CHECK-DAG: ld.param.b8 [[E4:%rs[0-9]+]], [test_v5i1_param_0+4]; ; CHECK-DAG: ld.param.b8 [[E0:%rs[0-9]+]], [test_v5i1_param_0] ; CHECK: .param .align 1 .b8 param0[1]; +; CHECK: .param .align 1 .b8 retval0[1]; ; CHECK-DAG: st.param.b8 [param0], [[E0]]; ; CHECK-DAG: st.param.b8 [param0+4], [[E4]]; -; CHECK: .param .align 1 .b8 retval0[1]; ; CHECK: call.uni (retval0), test_v5i1, ; CHECK-DAG: ld.param.b8 [[RE0:%rs[0-9]+]], [retval0]; ; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4]; @@ -131,8 +131,8 @@ define <5 x i1> @test_v5i1(<5 x i1> %a) { ; CHECK-NEXT: .param .b32 test_i2_param_0 ; CHECK: ld.param.b8 {{%rs[0-9]+}}, [test_i2_param_0]; ; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: .param .b32 retval0; +; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: call.uni (retval0), test_i2, ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0]; ; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}}; @@ -147,8 +147,8 @@ define i2 @test_i2(i2 %a) { ; CHECK-NEXT: .param .b32 test_i3_param_0 ; CHECK: ld.param.b8 {{%rs[0-9]+}}, [test_i3_param_0]; ; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: .param .b32 retval0; +; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: call.uni (retval0), test_i3, ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0]; ; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}}; @@ -163,10 +163,10 @@ define i3 @test_i3(i3 %a) { ; CHECK-LABEL: test_i8( ; CHECK-NEXT: .param .b32 test_i8_param_0 ; CHECK: ld.param.b8 [[A8:%rs[0-9]+]], [test_i8_param_0]; -; 
CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]]; ; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0], [[A32]]; ; CHECK: .param .b32 retval0; +; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]]; +; CHECK: st.param.b32 [param0], [[A32]]; ; CHECK: call.uni (retval0), test_i8, ; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0]; ; CHECK: st.param.b32 [func_retval0], [[R32]]; @@ -181,10 +181,10 @@ define i8 @test_i8(i8 %a) { ; CHECK-LABEL: test_i8s( ; CHECK-NEXT: .param .b32 test_i8s_param_0 ; CHECK: ld.param.s8 [[A8:%rs[0-9]+]], [test_i8s_param_0]; -; CHECK: cvt.s32.s16 [[A:%r[0-9]+]], [[A8]]; ; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0], [[A]]; ; CHECK: .param .b32 retval0; +; CHECK: cvt.s32.s16 [[A:%r[0-9]+]], [[A8]]; +; CHECK: st.param.b32 [param0], [[A]]; ; CHECK: call.uni (retval0), test_i8s, ; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0]; ; -- This is suspicious (though correct) -- why not cvt.u8.u32, cvt.s8.s32 ? @@ -202,8 +202,8 @@ define signext i8 @test_i8s(i8 signext %a) { ; CHECK-NEXT: .param .align 4 .b8 test_v3i8_param_0[4] ; CHECK: ld.param.b32 [[R:%r[0-9]+]], [test_v3i8_param_0]; ; CHECK: .param .align 4 .b8 param0[4]; -; CHECK: st.param.b32 [param0], [[R]] ; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: st.param.b32 [param0], [[R]] ; CHECK: call.uni (retval0), test_v3i8, ; CHECK: ld.param.b32 [[RE:%r[0-9]+]], [retval0]; ; v4i8/i32->{v3i8 elements}->v4i8/i32 conversion is messy and not very @@ -220,8 +220,8 @@ define <3 x i8> @test_v3i8(<3 x i8> %a) { ; CHECK-NEXT: .param .align 4 .b8 test_v4i8_param_0[4] ; CHECK: ld.param.b32 [[R:%r[0-9]+]], [test_v4i8_param_0] ; CHECK: .param .align 4 .b8 param0[4]; -; CHECK: st.param.b32 [param0], [[R]]; ; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: st.param.b32 [param0], [[R]]; ; CHECK: call.uni (retval0), test_v4i8, ; CHECK: ld.param.b32 [[RET:%r[0-9]+]], [retval0]; ; CHECK: st.param.b32 [func_retval0], [[RET]]; @@ -237,20 +237,13 @@ define <4 x i8> @test_v4i8(<4 x i8> %a) { ; CHECK-DAG: ld.param.b32 [[E0:%r[0-9]+]], [test_v5i8_param_0] ; CHECK-DAG: ld.param.b8 [[E4:%rs[0-9]+]], [test_v5i8_param_0+4]; ; CHECK: .param .align 8 .b8 param0[8]; -; CHECK-DAG: st.param.v4.b8 [param0], -; CHECK-DAG: st.param.b8 [param0+4], [[E4]]; ; CHECK: .param .align 8 .b8 retval0[8]; +; CHECK-DAG: st.param.b32 [param0], [[E0]]; +; CHECK-DAG: st.param.b8 [param0+4], [[E4]]; ; CHECK: call.uni (retval0), test_v5i8, -; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0]; +; CHECK-DAG: ld.param.b32 [[RE0:%r[0-9]+]], [retval0]; ; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4]; -; CHECK-DAG: cvt.u32.u16 [[R3:%r[0-9]+]], [[RE3]]; -; CHECK-DAG: cvt.u32.u16 [[R2:%r[0-9]+]], [[RE2]]; -; CHECK-DAG: prmt.b32 [[P0:%r[0-9]+]], [[R2]], [[R3]], 0x3340U; -; CHECK-DAG: cvt.u32.u16 [[R1:%r[0-9]+]], [[RE1]]; -; CHECK-DAG: cvt.u32.u16 [[R0:%r[0-9]+]], [[RE0]]; -; CHECK-DAG: prmt.b32 [[P1:%r[0-9]+]], [[R0]], [[R1]], 0x3340U; -; CHECK-DAG: prmt.b32 [[P2:%r[0-9]+]], [[P1]], [[P0]], 0x5410U; -; CHECK-DAG: st.param.b32 [func_retval0], [[P2]]; +; CHECK-DAG: st.param.b32 [func_retval0], [[RE0]]; ; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]]; ; CHECK-NEXT: ret; define <5 x i8> @test_v5i8(<5 x i8> %a) { @@ -262,8 +255,8 @@ define <5 x i8> @test_v5i8(<5 x i8> %a) { ; CHECK-LABEL: test_i11( ; CHECK-NEXT: .param .b32 test_i11_param_0 ; CHECK: ld.param.b16 {{%rs[0-9]+}}, [test_i11_param_0]; -; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: .param .b32 retval0; +; CHECK: st.param.b32 
[param0], {{%r[0-9]+}}; ; CHECK: call.uni (retval0), test_i11, ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0]; ; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}}; @@ -277,10 +270,10 @@ define i11 @test_i11(i11 %a) { ; CHECK-LABEL: test_i16( ; CHECK-NEXT: .param .b32 test_i16_param_0 ; CHECK: ld.param.b16 [[E16:%rs[0-9]+]], [test_i16_param_0]; -; CHECK: cvt.u32.u16 [[E32:%r[0-9]+]], [[E16]]; ; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0], [[E32]]; ; CHECK: .param .b32 retval0; +; CHECK: cvt.u32.u16 [[E32:%r[0-9]+]], [[E16]]; +; CHECK: st.param.b32 [param0], [[E32]]; ; CHECK: call.uni (retval0), test_i16, ; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0]; ; CHECK: st.param.b32 [func_retval0], [[RE32]]; @@ -294,10 +287,10 @@ define i16 @test_i16(i16 %a) { ; CHECK-LABEL: test_i16s( ; CHECK-NEXT: .param .b32 test_i16s_param_0 ; CHECK: ld.param.b16 [[E16:%rs[0-9]+]], [test_i16s_param_0]; -; CHECK: cvt.s32.s16 [[E32:%r[0-9]+]], [[E16]]; ; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0], [[E32]]; ; CHECK: .param .b32 retval0; +; CHECK: cvt.s32.s16 [[E32:%r[0-9]+]], [[E16]]; +; CHECK: st.param.b32 [param0], [[E32]]; ; CHECK: call.uni (retval0), test_i16s, ; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0]; ; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[RE32]]; @@ -312,14 +305,15 @@ define signext i16 @test_i16s(i16 signext %a) { ; CHECK-LABEL: test_v3i16( ; CHECK-NEXT: .param .align 8 .b8 test_v3i16_param_0[8] ; CHECK-DAG: ld.param.b16 [[E2:%rs[0-9]+]], [test_v3i16_param_0+4]; -; CHECK-DAG: ld.param.v2.b16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i16_param_0]; +; CHECK-DAG: ld.param.b32 [[E0:%r[0-9]+]], [test_v3i16_param_0]; ; CHECK: .param .align 8 .b8 param0[8]; -; CHECK: st.param.v2.b16 [param0], {[[E0]], [[E1]]}; -; CHECK: st.param.b16 [param0+4], [[E2]]; ; CHECK: .param .align 8 .b8 retval0[8]; +; CHECK-DAG: st.param.b32 [param0], [[E0]]; +; CHECK-DAG: st.param.b16 [param0+4], [[E2]]; ; CHECK: call.uni (retval0), test_v3i16, -; CHECK: ld.param.v2.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0]; +; CHECK: ld.param.b32 [[RE:%r[0-9]+]], [retval0]; ; CHECK: ld.param.b16 [[RE2:%rs[0-9]+]], [retval0+4]; +; CHECK-DAG: mov.b32 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [[RE]]; ; CHECK-DAG: st.param.v2.b16 [func_retval0], {[[RE0]], [[RE1]]}; ; CHECK-DAG: st.param.b16 [func_retval0+4], [[RE2]]; ; CHECK-NEXT: ret; @@ -333,8 +327,8 @@ define <3 x i16> @test_v3i16(<3 x i16> %a) { ; CHECK-NEXT: .param .align 8 .b8 test_v4i16_param_0[8] ; CHECK: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v4i16_param_0] ; CHECK: .param .align 8 .b8 param0[8]; -; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; ; CHECK: .param .align 8 .b8 retval0[8]; +; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; ; CHECK: call.uni (retval0), test_v4i16, ; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0]; ; CHECK: st.param.v2.b32 [func_retval0], {[[RE0]], [[RE1]]} @@ -348,15 +342,15 @@ define <4 x i16> @test_v4i16(<4 x i16> %a) { ; CHECK-LABEL: test_v5i16( ; CHECK-NEXT: .param .align 16 .b8 test_v5i16_param_0[16] ; CHECK-DAG: ld.param.b16 [[E4:%rs[0-9]+]], [test_v5i16_param_0+8]; -; CHECK-DAG: ld.param.v4.b16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i16_param_0] +; CHECK-DAG: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v5i16_param_0] ; CHECK: .param .align 16 .b8 param0[16]; -; CHECK-DAG: st.param.v4.b16 [param0], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK-DAG: st.param.b16 [param0+8], [[E4]]; ; CHECK: .param .align 16 
.b8 retval0[16]; +; CHECK-DAG: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; +; CHECK-DAG: st.param.b16 [param0+8], [[E4]]; ; CHECK: call.uni (retval0), test_v5i16, -; CHECK-DAG: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0]; +; CHECK-DAG: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0]; ; CHECK-DAG: ld.param.b16 [[RE4:%rs[0-9]+]], [retval0+8]; -; CHECK-DAG: st.param.v4.b16 [func_retval0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} +; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[RE0]], [[RE1]]} ; CHECK-DAG: st.param.b16 [func_retval0+8], [[RE4]]; ; CHECK-NEXT: ret; define <5 x i16> @test_v5i16(<5 x i16> %a) { @@ -369,8 +363,8 @@ define <5 x i16> @test_v5i16(<5 x i16> %a) { ; CHECK-NEXT: .param .align 2 .b8 test_f16_param_0[2] ; CHECK: ld.param.b16 [[E:%rs[0-9]+]], [test_f16_param_0]; ; CHECK: .param .align 2 .b8 param0[2]; -; CHECK: st.param.b16 [param0], [[E]]; ; CHECK: .param .align 2 .b8 retval0[2]; +; CHECK: st.param.b16 [param0], [[E]]; ; CHECK: call.uni (retval0), test_f16, ; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0]; ; CHECK: st.param.b16 [func_retval0], [[R]] @@ -385,8 +379,8 @@ define half @test_f16(half %a) { ; CHECK-NEXT: .param .align 4 .b8 test_v2f16_param_0[4] ; CHECK: ld.param.b32 [[E:%r[0-9]+]], [test_v2f16_param_0]; ; CHECK: .param .align 4 .b8 param0[4]; -; CHECK: st.param.b32 [param0], [[E]]; ; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: st.param.b32 [param0], [[E]]; ; CHECK: call.uni (retval0), test_v2f16, ; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0]; ; CHECK: st.param.b32 [func_retval0], [[R]] @@ -401,8 +395,8 @@ define <2 x half> @test_v2f16(<2 x half> %a) { ; CHECK-NEXT: .param .align 2 .b8 test_bf16_param_0[2] ; CHECK: ld.param.b16 [[E:%rs[0-9]+]], [test_bf16_param_0]; ; CHECK: .param .align 2 .b8 param0[2]; -; CHECK: st.param.b16 [param0], [[E]]; ; CHECK: .param .align 2 .b8 retval0[2]; +; CHECK: st.param.b16 [param0], [[E]]; ; CHECK: call.uni (retval0), test_bf16, ; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0]; ; CHECK: st.param.b16 [func_retval0], [[R]] @@ -417,8 +411,8 @@ define bfloat @test_bf16(bfloat %a) { ; CHECK-NEXT: .param .align 4 .b8 test_v2bf16_param_0[4] ; CHECK: ld.param.b32 [[E:%r[0-9]+]], [test_v2bf16_param_0]; ; CHECK: .param .align 4 .b8 param0[4]; -; CHECK: st.param.b32 [param0], [[E]]; ; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: st.param.b32 [param0], [[E]]; ; CHECK: call.uni (retval0), test_v2bf16, ; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0]; ; CHECK: st.param.b32 [func_retval0], [[R]] @@ -432,15 +426,16 @@ define <2 x bfloat> @test_v2bf16(<2 x bfloat> %a) { ; CHECK:.func (.param .align 8 .b8 func_retval0[8]) ; CHECK-LABEL: test_v3f16( ; CHECK: .param .align 8 .b8 test_v3f16_param_0[8] -; CHECK-DAG: ld.param.v2.b16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3f16_param_0]; +; CHECK-DAG: ld.param.b32 [[E0:%r[0-9]+]], [test_v3f16_param_0]; ; CHECK-DAG: ld.param.b16 [[E2:%rs[0-9]+]], [test_v3f16_param_0+4]; ; CHECK: .param .align 8 .b8 param0[8]; -; CHECK-DAG: st.param.v2.b16 [param0], {[[E0]], [[E1]]}; -; CHECK-DAG: st.param.b16 [param0+4], [[E2]]; ; CHECK: .param .align 8 .b8 retval0[8]; +; CHECK-DAG: st.param.b32 [param0], [[E0]]; +; CHECK-DAG: st.param.b16 [param0+4], [[E2]]; ; CHECK: call.uni (retval0), test_v3f16, -; CHECK-DAG: ld.param.v2.b16 {[[R0:%rs[0-9]+]], [[R1:%rs[0-9]+]]}, [retval0]; +; CHECK-DAG: ld.param.b32 [[R:%r[0-9]+]], [retval0]; ; CHECK-DAG: ld.param.b16 [[R2:%rs[0-9]+]], [retval0+4]; +; CHECK-DAG: mov.b32 {[[R0:%rs[0-9]+]], 
[[R1:%rs[0-9]+]]}, [[R]]; ; CHECK-DAG: st.param.v2.b16 [func_retval0], {[[R0]], [[R1]]}; ; CHECK-DAG: st.param.b16 [func_retval0+4], [[R2]]; ; CHECK: ret; @@ -454,8 +449,8 @@ define <3 x half> @test_v3f16(<3 x half> %a) { ; CHECK: .param .align 8 .b8 test_v4f16_param_0[8] ; CHECK: ld.param.v2.b32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]]}, [test_v4f16_param_0]; ; CHECK: .param .align 8 .b8 param0[8]; -; CHECK: st.param.v2.b32 [param0], {[[R01]], [[R23]]}; ; CHECK: .param .align 8 .b8 retval0[8]; +; CHECK: st.param.v2.b32 [param0], {[[R01]], [[R23]]}; ; CHECK: call.uni (retval0), test_v4f16, ; CHECK: ld.param.v2.b32 {[[RH01:%r[0-9]+]], [[RH23:%r[0-9]+]]}, [retval0]; ; CHECK: st.param.v2.b32 [func_retval0], {[[RH01]], [[RH23]]}; @@ -468,16 +463,16 @@ define <4 x half> @test_v4f16(<4 x half> %a) { ; CHECK:.func (.param .align 16 .b8 func_retval0[16]) ; CHECK-LABEL: test_v5f16( ; CHECK: .param .align 16 .b8 test_v5f16_param_0[16] -; CHECK-DAG: ld.param.v4.b16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5f16_param_0]; +; CHECK-DAG: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v5f16_param_0]; ; CHECK-DAG: ld.param.b16 [[E4:%rs[0-9]+]], [test_v5f16_param_0+8]; ; CHECK: .param .align 16 .b8 param0[16]; -; CHECK-DAG: st.param.v4.b16 [param0], -; CHECK-DAG: st.param.b16 [param0+8], [[E4]]; ; CHECK: .param .align 16 .b8 retval0[16]; +; CHECK-DAG: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; +; CHECK-DAG: st.param.b16 [param0+8], [[E4]]; ; CHECK: call.uni (retval0), test_v5f16, -; CHECK-DAG: ld.param.v4.b16 {[[R0:%rs[0-9]+]], [[R1:%rs[0-9]+]], [[R2:%rs[0-9]+]], [[R3:%rs[0-9]+]]}, [retval0]; +; CHECK-DAG: ld.param.v2.b32 {[[R0:%r[0-9]+]], [[R1:%r[0-9]+]]}, [retval0]; ; CHECK-DAG: ld.param.b16 [[R4:%rs[0-9]+]], [retval0+8]; -; CHECK-DAG: st.param.v4.b16 [func_retval0], {[[R0]], [[R1]], [[R2]], [[R3]]}; +; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[R0]], [[R1]]}; ; CHECK-DAG: st.param.b16 [func_retval0+8], [[R4]]; ; CHECK: ret; define <5 x half> @test_v5f16(<5 x half> %a) { @@ -490,8 +485,8 @@ define <5 x half> @test_v5f16(<5 x half> %a) { ; CHECK: .param .align 16 .b8 test_v8f16_param_0[16] ; CHECK: ld.param.v4.b32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]], [[R45:%r[0-9]+]], [[R67:%r[0-9]+]]}, [test_v8f16_param_0]; ; CHECK: .param .align 16 .b8 param0[16]; -; CHECK: st.param.v4.b32 [param0], {[[R01]], [[R23]], [[R45]], [[R67]]}; ; CHECK: .param .align 16 .b8 retval0[16]; +; CHECK: st.param.v4.b32 [param0], {[[R01]], [[R23]], [[R45]], [[R67]]}; ; CHECK: call.uni (retval0), test_v8f16, ; CHECK: ld.param.v4.b32 {[[RH01:%r[0-9]+]], [[RH23:%r[0-9]+]], [[RH45:%r[0-9]+]], [[RH67:%r[0-9]+]]}, [retval0]; ; CHECK: st.param.v4.b32 [func_retval0], {[[RH01]], [[RH23]], [[RH45]], [[RH67]]}; @@ -504,20 +499,20 @@ define <8 x half> @test_v8f16(<8 x half> %a) { ; CHECK:.func (.param .align 32 .b8 func_retval0[32]) ; CHECK-LABEL: test_v9f16( ; CHECK: .param .align 32 .b8 test_v9f16_param_0[32] -; CHECK-DAG: ld.param.v4.b16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v9f16_param_0]; -; CHECK-DAG: ld.param.v4.b16 {[[E4:%rs[0-9]+]], [[E5:%rs[0-9]+]], [[E6:%rs[0-9]+]], [[E7:%rs[0-9]+]]}, [test_v9f16_param_0+8]; +; CHECK-DAG: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v9f16_param_0]; +; CHECK-DAG: ld.param.v2.b32 {[[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v9f16_param_0+8]; ; CHECK-DAG: ld.param.b16 [[E8:%rs[0-9]+]], [test_v9f16_param_0+16]; ; CHECK: .param .align 32 .b8 param0[32]; -; CHECK-DAG: st.param.v4.b16 [param0], -; CHECK-DAG: 
st.param.v4.b16 [param0+8], -; CHECK-DAG: st.param.b16 [param0+16], [[E8]]; ; CHECK: .param .align 32 .b8 retval0[32]; +; CHECK-DAG: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; +; CHECK-DAG: st.param.v2.b32 [param0+8], {[[E2]], [[E3]]}; +; CHECK-DAG: st.param.b16 [param0+16], [[E8]]; ; CHECK: call.uni (retval0), test_v9f16, -; CHECK-DAG: ld.param.v4.b16 {[[R0:%rs[0-9]+]], [[R1:%rs[0-9]+]], [[R2:%rs[0-9]+]], [[R3:%rs[0-9]+]]}, [retval0]; -; CHECK-DAG: ld.param.v4.b16 {[[R4:%rs[0-9]+]], [[R5:%rs[0-9]+]], [[R6:%rs[0-9]+]], [[R7:%rs[0-9]+]]}, [retval0+8]; +; CHECK-DAG: ld.param.v2.b32 {[[R0:%r[0-9]+]], [[R1:%r[0-9]+]]}, [retval0]; +; CHECK-DAG: ld.param.v2.b32 {[[R2:%r[0-9]+]], [[R3:%r[0-9]+]]}, [retval0+8]; ; CHECK-DAG: ld.param.b16 [[R8:%rs[0-9]+]], [retval0+16]; -; CHECK-DAG: st.param.v4.b16 [func_retval0], {[[R0]], [[R1]], [[R2]], [[R3]]}; -; CHECK-DAG: st.param.v4.b16 [func_retval0+8], {[[R4]], [[R5]], [[R6]], [[R7]]}; +; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[R0]], [[R1]]}; +; CHECK-DAG: st.param.v2.b32 [func_retval0+8], {[[R2]], [[R3]]}; ; CHECK-DAG: st.param.b16 [func_retval0+16], [[R8]]; ; CHECK: ret; define <9 x half> @test_v9f16(<9 x half> %a) { @@ -531,8 +526,8 @@ define <9 x half> @test_v9f16(<9 x half> %a) { ; CHECK-DAG: ld.param.b16 {{%r[0-9]+}}, [test_i19_param_0]; ; CHECK-DAG: ld.param.b8 {{%r[0-9]+}}, [test_i19_param_0+2]; ; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: .param .b32 retval0; +; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: call.uni (retval0), test_i19, ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0]; ; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}}; @@ -548,8 +543,8 @@ define i19 @test_i19(i19 %a) { ; CHECK-DAG: ld.param.b16 {{%r[0-9]+}}, [test_i23_param_0]; ; CHECK-DAG: ld.param.b8 {{%r[0-9]+}}, [test_i23_param_0+2]; ; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: .param .b32 retval0; +; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: call.uni (retval0), test_i23, ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0]; ; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}}; @@ -565,8 +560,8 @@ define i23 @test_i23(i23 %a) { ; CHECK-DAG: ld.param.b8 {{%r[0-9]+}}, [test_i24_param_0+2]; ; CHECK-DAG: ld.param.b16 {{%r[0-9]+}}, [test_i24_param_0]; ; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: .param .b32 retval0; +; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: call.uni (retval0), test_i24, ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0]; ; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}}; @@ -581,8 +576,8 @@ define i24 @test_i24(i24 %a) { ; CHECK-NEXT: .param .b32 test_i29_param_0 ; CHECK: ld.param.b32 {{%r[0-9]+}}, [test_i29_param_0]; ; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: .param .b32 retval0; +; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: call.uni (retval0), test_i29, ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0]; ; CHECK: st.param.b32 [func_retval0], {{%r[0-9]+}}; @@ -597,8 +592,8 @@ define i29 @test_i29(i29 %a) { ; CHECK-NEXT: .param .b32 test_i32_param_0 ; CHECK: ld.param.b32 [[E:%r[0-9]+]], [test_i32_param_0]; ; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0], [[E]]; ; CHECK: .param .b32 retval0; +; CHECK: st.param.b32 [param0], [[E]]; ; CHECK: call.uni (retval0), test_i32, ; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0]; ; CHECK: st.param.b32 [func_retval0], [[R]]; @@ -613,10 +608,10 @@ define i32 @test_i32(i32 %a) { ; CHECK-NEXT: .param .align 16 .b8 
test_v3i32_param_0[16] ; CHECK-DAG: ld.param.b32 [[E2:%r[0-9]+]], [test_v3i32_param_0+8]; ; CHECK-DAG: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v3i32_param_0]; -; CHECK: .param .align 16 .b8 param0[16]; -; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; -; CHECK: st.param.b32 [param0+8], [[E2]]; -; CHECK: .param .align 16 .b8 retval0[16]; +; CHECK-DAG: .param .align 16 .b8 param0[16]; +; CHECK-DAG: .param .align 16 .b8 retval0[16]; +; CHECK-DAG: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; +; CHECK-DAG: st.param.b32 [param0+8], [[E2]]; ; CHECK: call.uni (retval0), test_v3i32, ; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0]; ; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; @@ -632,9 +627,9 @@ define <3 x i32> @test_v3i32(<3 x i32> %a) { ; CHECK-LABEL: test_v4i32( ; CHECK-NEXT: .param .align 16 .b8 test_v4i32_param_0[16] ; CHECK: ld.param.v4.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v4i32_param_0] -; CHECK: .param .align 16 .b8 param0[16]; -; CHECK: st.param.v4.b32 [param0], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK: .param .align 16 .b8 retval0[16]; +; CHECK-DAG: .param .align 16 .b8 param0[16]; +; CHECK-DAG: .param .align 16 .b8 retval0[16]; +; CHECK-DAG: st.param.v4.b32 [param0], {[[E0]], [[E1]], [[E2]], [[E3]]}; ; CHECK: call.uni (retval0), test_v4i32, ; CHECK: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0]; ; CHECK: st.param.v4.b32 [func_retval0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} @@ -650,9 +645,9 @@ define <4 x i32> @test_v4i32(<4 x i32> %a) { ; CHECK-DAG: ld.param.b32 [[E4:%r[0-9]+]], [test_v5i32_param_0+16]; ; CHECK-DAG: ld.param.v4.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v5i32_param_0] ; CHECK: .param .align 32 .b8 param0[32]; +; CHECK: .param .align 32 .b8 retval0[32]; ; CHECK-DAG: st.param.v4.b32 [param0], {[[E0]], [[E1]], [[E2]], [[E3]]}; ; CHECK-DAG: st.param.b32 [param0+16], [[E4]]; -; CHECK: .param .align 32 .b8 retval0[32]; ; CHECK: call.uni (retval0), test_v5i32, ; CHECK-DAG: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0]; ; CHECK-DAG: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16]; @@ -669,8 +664,8 @@ define <5 x i32> @test_v5i32(<5 x i32> %a) { ; CHECK-NEXT: .param .b32 test_f32_param_0 ; CHECK: ld.param.b32 [[E:%r[0-9]+]], [test_f32_param_0]; ; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0], [[E]]; ; CHECK: .param .b32 retval0; +; CHECK: st.param.b32 [param0], [[E]]; ; CHECK: call.uni (retval0), test_f32, ; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0]; ; CHECK: st.param.b32 [func_retval0], [[R]]; @@ -686,8 +681,8 @@ define float @test_f32(float %a) { ; CHECK-DAG: ld.param.b8 {{%rd[0-9]+}}, [test_i40_param_0+4]; ; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i40_param_0]; ; CHECK: .param .b64 param0; -; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: .param .b64 retval0; +; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: call.uni (retval0), test_i40, ; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0]; ; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}}; @@ -703,8 +698,8 @@ define i40 @test_i40(i40 %a) { ; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i47_param_0+4]; ; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i47_param_0]; ; CHECK: .param .b64 param0; -; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: .param .b64 retval0; +; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: call.uni (retval0), test_i47, ; CHECK: 
ld.param.b64 {{%rd[0-9]+}}, [retval0]; ; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}}; @@ -720,8 +715,8 @@ define i47 @test_i47(i47 %a) { ; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i48_param_0+4]; ; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i48_param_0]; ; CHECK: .param .b64 param0; -; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: .param .b64 retval0; +; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: call.uni (retval0), test_i48, ; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0]; ; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}}; @@ -738,8 +733,8 @@ define i48 @test_i48(i48 %a) { ; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i51_param_0+4]; ; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i51_param_0]; ; CHECK: .param .b64 param0; -; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: .param .b64 retval0; +; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: call.uni (retval0), test_i51, ; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0]; ; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}}; @@ -756,8 +751,8 @@ define i51 @test_i51(i51 %a) { ; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i56_param_0+4]; ; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i56_param_0]; ; CHECK: .param .b64 param0; -; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: .param .b64 retval0; +; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: call.uni (retval0), test_i56, ; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0]; ; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}}; @@ -772,8 +767,8 @@ define i56 @test_i56(i56 %a) { ; CHECK-NEXT: .param .b64 test_i57_param_0 ; CHECK: ld.param.b64 {{%rd[0-9]+}}, [test_i57_param_0]; ; CHECK: .param .b64 param0; -; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: .param .b64 retval0; +; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: call.uni (retval0), test_i57, ; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0]; ; CHECK: st.param.b64 [func_retval0], {{%rd[0-9]+}}; @@ -788,8 +783,8 @@ define i57 @test_i57(i57 %a) { ; CHECK-NEXT: .param .b64 test_i64_param_0 ; CHECK: ld.param.b64 [[E:%rd[0-9]+]], [test_i64_param_0]; ; CHECK: .param .b64 param0; -; CHECK: st.param.b64 [param0], [[E]]; ; CHECK: .param .b64 retval0; +; CHECK: st.param.b64 [param0], [[E]]; ; CHECK: call.uni (retval0), test_i64, ; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0]; ; CHECK: st.param.b64 [func_retval0], [[R]]; @@ -805,9 +800,9 @@ define i64 @test_i64(i64 %a) { ; CHECK-DAG: ld.param.b64 [[E2:%rd[0-9]+]], [test_v3i64_param_0+16]; ; CHECK-DAG: ld.param.v2.b64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v3i64_param_0]; ; CHECK: .param .align 32 .b8 param0[32]; -; CHECK: st.param.v2.b64 [param0], {[[E0]], [[E1]]}; -; CHECK: st.param.b64 [param0+16], [[E2]]; ; CHECK: .param .align 32 .b8 retval0[32]; +; CHECK-DAG: st.param.v2.b64 [param0], {[[E0]], [[E1]]}; +; CHECK-DAG: st.param.b64 [param0+16], [[E2]]; ; CHECK: call.uni (retval0), test_v3i64, ; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0]; ; CHECK: ld.param.b64 [[RE2:%rd[0-9]+]], [retval0+16]; @@ -828,9 +823,9 @@ define <3 x i64> @test_v3i64(<3 x i64> %a) { ; CHECK-DAG: ld.param.v2.b64 {[[E2:%rd[0-9]+]], [[E3:%rd[0-9]+]]}, [test_v4i64_param_0+16]; ; CHECK-DAG: ld.param.v2.b64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v4i64_param_0]; ; CHECK: .param .align 32 .b8 param0[32]; -; CHECK: st.param.v2.b64 [param0], {[[E0]], [[E1]]}; -; CHECK: st.param.v2.b64 [param0+16], {[[E2]], [[E3]]}; ; CHECK: .param .align 32 .b8 retval0[32]; +; CHECK-DAG: st.param.v2.b64 [param0], {[[E0]], [[E1]]}; +; 
CHECK-DAG: st.param.v2.b64 [param0+16], {[[E2]], [[E3]]}; ; CHECK: call.uni (retval0), test_v4i64, ; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0]; ; CHECK: ld.param.v2.b64 {[[RE2:%rd[0-9]+]], [[RE3:%rd[0-9]+]]}, [retval0+16]; @@ -849,8 +844,8 @@ define <4 x i64> @test_v4i64(<4 x i64> %a) { ; CHECK-NEXT: .align 1 .b8 test_s_i1_param_0[1] ; CHECK: ld.param.b8 [[A:%rs[0-9]+]], [test_s_i1_param_0]; ; CHECK: .param .align 1 .b8 param0[1]; -; CHECK: st.param.b8 [param0], [[A]] ; CHECK: .param .align 1 .b8 retval0[1]; +; CHECK: st.param.b8 [param0], [[A]] ; CHECK: call.uni (retval0), test_s_i1, ; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0]; ; CHECK: st.param.b8 [func_retval0], [[R]]; @@ -865,8 +860,8 @@ define %s_i1 @test_s_i1(%s_i1 %a) { ; CHECK-NEXT: .param .align 1 .b8 test_s_i8_param_0[1] ; CHECK: ld.param.b8 [[A:%rs[0-9]+]], [test_s_i8_param_0]; ; CHECK: .param .align 1 .b8 param0[1]; -; CHECK: st.param.b8 [param0], [[A]] ; CHECK: .param .align 1 .b8 retval0[1]; +; CHECK: st.param.b8 [param0], [[A]] ; CHECK: call.uni (retval0), test_s_i8, ; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0]; ; CHECK: st.param.b8 [func_retval0], [[R]]; @@ -881,8 +876,8 @@ define %s_i8 @test_s_i8(%s_i8 %a) { ; CHECK-NEXT: .param .align 2 .b8 test_s_i16_param_0[2] ; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_s_i16_param_0]; ; CHECK: .param .align 2 .b8 param0[2]; -; CHECK: st.param.b16 [param0], [[A]] ; CHECK: .param .align 2 .b8 retval0[2]; +; CHECK: st.param.b16 [param0], [[A]] ; CHECK: call.uni (retval0), test_s_i16, ; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0]; ; CHECK: st.param.b16 [func_retval0], [[R]]; @@ -897,8 +892,8 @@ define %s_i16 @test_s_i16(%s_i16 %a) { ; CHECK-NEXT: .param .align 2 .b8 test_s_f16_param_0[2] ; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_s_f16_param_0]; ; CHECK: .param .align 2 .b8 param0[2]; -; CHECK: st.param.b16 [param0], [[A]] ; CHECK: .param .align 2 .b8 retval0[2]; +; CHECK: st.param.b16 [param0], [[A]] ; CHECK: call.uni (retval0), test_s_f16, ; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0]; ; CHECK: st.param.b16 [func_retval0], [[R]]; @@ -913,8 +908,8 @@ define %s_f16 @test_s_f16(%s_f16 %a) { ; CHECK-NEXT: .param .align 4 .b8 test_s_i32_param_0[4] ; CHECK: ld.param.b32 [[E:%r[0-9]+]], [test_s_i32_param_0]; ; CHECK: .param .align 4 .b8 param0[4] -; CHECK: st.param.b32 [param0], [[E]]; ; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: st.param.b32 [param0], [[E]]; ; CHECK: call.uni (retval0), test_s_i32, ; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0]; ; CHECK: st.param.b32 [func_retval0], [[R]]; @@ -929,8 +924,8 @@ define %s_i32 @test_s_i32(%s_i32 %a) { ; CHECK-NEXT: .param .align 4 .b8 test_s_f32_param_0[4] ; CHECK: ld.param.b32 [[E:%r[0-9]+]], [test_s_f32_param_0]; ; CHECK: .param .align 4 .b8 param0[4] -; CHECK: st.param.b32 [param0], [[E]]; ; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: st.param.b32 [param0], [[E]]; ; CHECK: call.uni (retval0), test_s_f32, ; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0]; ; CHECK: st.param.b32 [func_retval0], [[R]]; @@ -945,8 +940,8 @@ define %s_f32 @test_s_f32(%s_f32 %a) { ; CHECK-NEXT: .param .align 8 .b8 test_s_i64_param_0[8] ; CHECK: ld.param.b64 [[E:%rd[0-9]+]], [test_s_i64_param_0]; ; CHECK: .param .align 8 .b8 param0[8]; -; CHECK: st.param.b64 [param0], [[E]]; ; CHECK: .param .align 8 .b8 retval0[8]; +; CHECK: st.param.b64 [param0], [[E]]; ; CHECK: call.uni (retval0), test_s_i64, ; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0]; ; CHECK: st.param.b64 [func_retval0], [[R]]; @@ -966,12 +961,12 @@ 
define %s_i64 @test_s_i64(%s_i64 %a) { ; CHECK-DAG: ld.param.b32 [[E1:%r[0-9]+]], [test_s_i32f32_param_0+4]; ; CHECK-DAG: ld.param.b32 [[E0:%r[0-9]+]], [test_s_i32f32_param_0]; ; CHECK: .param .align 8 .b8 param0[24]; +; CHECK: .param .align 8 .b8 retval0[24]; ; CHECK-DAG: st.param.b32 [param0], [[E0]]; ; CHECK-DAG: st.param.b32 [param0+4], [[E1]]; ; CHECK-DAG: st.param.b32 [param0+8], [[E2]]; ; CHECK-DAG: st.param.b32 [param0+12], [[E3]]; ; CHECK-DAG: st.param.b64 [param0+16], [[E4]]; -; CHECK: .param .align 8 .b8 retval0[24]; ; CHECK: call.uni (retval0), test_s_i32f32, ; CHECK-DAG: ld.param.b32 [[RE0:%r[0-9]+]], [retval0]; ; CHECK-DAG: ld.param.b32 [[RE1:%r[0-9]+]], [retval0+4]; @@ -997,10 +992,10 @@ define %s_i32f32 @test_s_i32f32(%s_i32f32 %a) { ; CHECK-DAG: ld.param.v2.b32 {[[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_s_i32x4_param_0+8]; ; CHECK-DAG: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i32x4_param_0]; ; CHECK: .param .align 8 .b8 param0[24]; -; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; -; CHECK: st.param.v2.b32 [param0+8], {[[E2]], [[E3]]}; -; CHECK: st.param.b64 [param0+16], [[E4]]; ; CHECK: .param .align 8 .b8 retval0[24]; +; CHECK-DAG: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; +; CHECK-DAG: st.param.v2.b32 [param0+8], {[[E2]], [[E3]]}; +; CHECK-DAG: st.param.b64 [param0+16], [[E4]]; ; CHECK: call.uni (retval0), test_s_i32x4, ; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0]; ; CHECK: ld.param.v2.b32 {[[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+8]; @@ -1024,16 +1019,13 @@ define %s_i32x4 @test_s_i32x4(%s_i32x4 %a) { ; CHECK: ld.param.b8 [[E2:%rs[0-9]+]], [test_s_i1i32x4_param_0+8]; ; CHECK: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i1i32x4_param_0]; ; CHECK: .param .align 8 .b8 param0[32]; -; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; -; CHECK: st.param.b8 [param0+8], [[E2]]; -; CHECK: st.param.b32 [param0+12], [[E3]]; -; CHECK: st.param.b32 [param0+16], [[E4]]; -; CHECK: st.param.b64 [param0+24], [[E5]]; ; CHECK: .param .align 8 .b8 retval0[32]; -; CHECK: call.uni (retval0), test_s_i1i32x4, -; CHECK: ( -; CHECK: param0 -; CHECK: ); +; CHECK-DAG: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; +; CHECK-DAG: st.param.b8 [param0+8], [[E2]]; +; CHECK-DAG: st.param.b32 [param0+12], [[E3]]; +; CHECK-DAG: st.param.b32 [param0+16], [[E4]]; +; CHECK-DAG: st.param.b64 [param0+24], [[E5]]; +; CHECK: call.uni (retval0), test_s_i1i32x4, (param0); ; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0]; ; CHECK: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+8]; ; CHECK: ld.param.b32 [[RE3:%r[0-9]+]], [retval0+12]; @@ -1082,6 +1074,7 @@ define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) { ; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+1]; ; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0]; ; CHECK: .param .align 1 .b8 param0[25]; +; CHECK: .param .align 1 .b8 retval0[25]; ; CHECK-DAG: st.param.b8 [param0], ; CHECK-DAG: st.param.b8 [param0+1], ; CHECK-DAG: st.param.b8 [param0+2], @@ -1107,33 +1100,32 @@ define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) { ; CHECK-DAG: st.param.b8 [param0+22], ; CHECK-DAG: st.param.b8 [param0+23], ; CHECK-DAG: st.param.b8 [param0+24], -; CHECK: .param .align 1 .b8 retval0[25]; -; CHECK: call.uni (retval0), test_s_i1i32x4p, -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+1]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+2]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+3]; -; CHECK-DAG: ld.param.b8 
%rs{{[0-9]+}}, [retval0+4]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+5]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+6]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+7]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+8]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+9]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+10]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+11]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+12]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+13]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+14]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+15]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+16]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+17]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+18]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+19]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+20]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+21]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+22]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+23]; -; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+24]; +; CHECK: call.uni (retval0), test_s_i1i32x4p, (param0); +; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+8]; +; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+3]; +; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+2]; +; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+1]; +; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0]; +; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+7]; +; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+6]; +; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+5]; +; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+4]; +; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+12]; +; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+11]; +; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+10]; +; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+9]; +; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+16]; +; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+15]; +; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+14]; +; CHECK-DAG: ld.param.b8 %r{{[0-9]+}}, [retval0+13]; +; CHECK-DAG: ld.param.b8 %rd{{[0-9]+}}, [retval0+24]; +; CHECK-DAG: ld.param.b8 %rd{{[0-9]+}}, [retval0+23]; +; CHECK-DAG: ld.param.b8 %rd{{[0-9]+}}, [retval0+22]; +; CHECK-DAG: ld.param.b8 %rd{{[0-9]+}}, [retval0+21]; +; CHECK-DAG: ld.param.b8 %rd{{[0-9]+}}, [retval0+20]; +; CHECK-DAG: ld.param.b8 %rd{{[0-9]+}}, [retval0+19]; +; CHECK-DAG: ld.param.b8 %rd{{[0-9]+}}, [retval0+18]; +; CHECK-DAG: ld.param.b8 %rd{{[0-9]+}}, [retval0+17]; ; CHECK: } // callseq ; CHECK-DAG: st.param.b8 [func_retval0], ; CHECK-DAG: st.param.b8 [func_retval0+1], @@ -1177,13 +1169,13 @@ define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) { ; CHECK: ld.param.b32 [[E2:%r[0-9]+]], [test_s_crossfield_param_0+8]; ; CHECK: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_crossfield_param_0]; ; CHECK: .param .align 16 .b8 param0[80]; -; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; -; CHECK: st.param.b32 [param0+8], [[E2]]; -; CHECK: st.param.v4.b32 [param0+16], {[[E3]], [[E4]], [[E5]], [[E6]]}; -; CHECK: st.param.v4.b32 [param0+32], {[[E7]], [[E8]], [[E9]], [[E10]]}; -; CHECK: st.param.v4.b32 [param0+48], {[[E11]], [[E12]], [[E13]], [[E14]]}; -; CHECK: st.param.b32 [param0+64], [[E15]]; ; CHECK: .param .align 16 .b8 retval0[80]; +; CHECK-DAG: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; +; CHECK-DAG: st.param.b32 [param0+8], [[E2]]; +; CHECK-DAG: st.param.v4.b32 [param0+16], {[[E3]], [[E4]], [[E5]], [[E6]]}; +; CHECK-DAG: st.param.v4.b32 [param0+32], 
{[[E7]], [[E8]], [[E9]], [[E10]]}; +; CHECK-DAG: st.param.v4.b32 [param0+48], {[[E11]], [[E12]], [[E13]], [[E14]]}; +; CHECK-DAG: st.param.b32 [param0+64], [[E15]]; ; CHECK: call.uni (retval0), test_s_crossfield, ; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0]; ; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; diff --git a/llvm/test/CodeGen/NVPTX/param-overalign.ll b/llvm/test/CodeGen/NVPTX/param-overalign.ll index 88ad0b0..2155fb4 100644 --- a/llvm/test/CodeGen/NVPTX/param-overalign.ll +++ b/llvm/test/CodeGen/NVPTX/param-overalign.ll @@ -28,8 +28,8 @@ define float @caller_md(float %a, float %b) { ; CHECK-NEXT: ld.param.b32 %r2, [caller_md_param_1]; ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; -; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2}; ; CHECK-NEXT: .param .b32 retval0; +; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2}; ; CHECK-NEXT: call.uni (retval0), callee_md, (param0); ; CHECK-NEXT: ld.param.b32 %r3, [retval0]; ; CHECK-NEXT: } // callseq 0 @@ -69,8 +69,8 @@ define float @caller(float %a, float %b) { ; CHECK-NEXT: ld.param.b32 %r2, [caller_param_1]; ; CHECK-NEXT: { // callseq 1, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; -; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2}; ; CHECK-NEXT: .param .b32 retval0; +; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, %r2}; ; CHECK-NEXT: call.uni (retval0), callee, (param0); ; CHECK-NEXT: ld.param.b32 %r3, [retval0]; ; CHECK-NEXT: } // callseq 1 diff --git a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll index a480984a..a592b82 100644 --- a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll +++ b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll @@ -84,8 +84,8 @@ define dso_local void @caller_St4x1(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .b64 caller_St4x1_param_1 ; CHECK: ) ; CHECK: .param .b32 param0; - ; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: .param .align 16 .b8 retval0[4]; + ; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: call.uni (retval0), callee_St4x1, (param0); ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0]; %1 = load i32, ptr %in, align 4 @@ -112,8 +112,8 @@ define dso_local void @caller_St4x2(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .b64 caller_St4x2_param_1 ; CHECK: ) ; CHECK: .param .align 16 .b8 param0[8]; - ; CHECK: st.param.v2.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: .param .align 16 .b8 retval0[8]; + ; CHECK: st.param.v2.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: call.uni (retval0), callee_St4x2, (param0); ; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0]; %agg.tmp = alloca %struct.St4x2, align 8 @@ -149,9 +149,9 @@ define dso_local void @caller_St4x3(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .b64 caller_St4x3_param_1 ; CHECK: ) ; CHECK: .param .align 16 .b8 param0[12]; + ; CHECK: .param .align 16 .b8 retval0[12]; ; CHECK: st.param.v2.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: st.param.b32 [param0+8], {{%r[0-9]+}}; - ; CHECK: .param .align 16 .b8 retval0[12]; ; CHECK: call.uni (retval0), callee_St4x3, (param0); ; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0]; ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+8]; @@ -193,8 +193,8 @@ define dso_local void @caller_St4x4(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .b64 caller_St4x4_param_1 ; CHECK: ) ; CHECK: .param .align 16 .b8 param0[16]; - ; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, 
{{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: .param .align 16 .b8 retval0[16]; + ; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: call.uni (retval0), callee_St4x4, (param0); ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0]; %call = tail call fastcc [4 x i32] @callee_St4x4(ptr noundef nonnull byval(%struct.St4x4) align 4 %in) #2 @@ -239,9 +239,9 @@ define dso_local void @caller_St4x5(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .b64 caller_St4x5_param_1 ; CHECK: ) ; CHECK: .param .align 16 .b8 param0[20]; + ; CHECK: .param .align 16 .b8 retval0[20]; ; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: st.param.b32 [param0+16], {{%r[0-9]+}}; - ; CHECK: .param .align 16 .b8 retval0[20]; ; CHECK: call.uni (retval0), callee_St4x5, (param0); ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0]; ; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+16]; @@ -295,9 +295,9 @@ define dso_local void @caller_St4x6(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .b64 caller_St4x6_param_1 ; CHECK: ) ; CHECK: .param .align 16 .b8 param0[24]; + ; CHECK: .param .align 16 .b8 retval0[24]; ; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: st.param.v2.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}}; - ; CHECK: .param .align 16 .b8 retval0[24]; ; CHECK: call.uni (retval0), callee_St4x6, (param0); ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0]; ; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16]; @@ -357,10 +357,10 @@ define dso_local void @caller_St4x7(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .b64 caller_St4x7_param_1 ; CHECK: ) ; CHECK: .param .align 16 .b8 param0[28]; + ; CHECK: .param .align 16 .b8 retval0[28]; ; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: st.param.v2.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: st.param.b32 [param0+24], {{%r[0-9]+}}; - ; CHECK: .param .align 16 .b8 retval0[28]; ; CHECK: call.uni (retval0), callee_St4x7, (param0); ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0]; ; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16]; @@ -429,9 +429,9 @@ define dso_local void @caller_St4x8(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .b64 caller_St4x8_param_1 ; CHECK: ) ; CHECK: .param .align 16 .b8 param0[32]; - ; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; - ; CHECK: st.param.v4.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: .param .align 16 .b8 retval0[32]; + ; CHECK-DAG: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; + ; CHECK-DAG: st.param.v4.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}; ; CHECK: call.uni (retval0), callee_St4x8, (param0); ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0]; ; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16]; @@ -503,8 +503,8 @@ define dso_local void @caller_St8x1(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .b64 caller_St8x1_param_1 ; CHECK: ) ; CHECK: .param .b64 param0; - ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: .param .align 
16 .b8 retval0[8]; + ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: call.uni (retval0), callee_St8x1, (param0); ; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0]; %1 = load i64, ptr %in, align 8 @@ -531,8 +531,8 @@ define dso_local void @caller_St8x2(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .b64 caller_St8x2_param_1 ; CHECK: ) ; CHECK: .param .align 16 .b8 param0[16]; - ; CHECK: st.param.v2.b64 [param0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; ; CHECK: .param .align 16 .b8 retval0[16]; + ; CHECK: st.param.v2.b64 [param0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; ; CHECK: call.uni (retval0), callee_St8x2, (param0); ; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0]; %call = tail call fastcc [2 x i64] @callee_St8x2(ptr noundef nonnull byval(%struct.St8x2) align 8 %in) #2 @@ -565,9 +565,9 @@ define dso_local void @caller_St8x3(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .b64 caller_St8x3_param_1 ; CHECK: ) ; CHECK: .param .align 16 .b8 param0[24]; + ; CHECK: .param .align 16 .b8 retval0[24]; ; CHECK: st.param.v2.b64 [param0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; ; CHECK: st.param.b64 [param0+16], {{%rd[0-9]+}}; - ; CHECK: .param .align 16 .b8 retval0[24]; ; CHECK: call.uni (retval0), callee_St8x3, (param0); ; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0]; ; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0+16]; @@ -609,9 +609,9 @@ define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct ; CHECK: .param .b64 caller_St8x4_param_1 ; CHECK: ) ; CHECK: .param .align 16 .b8 param0[32]; - ; CHECK: st.param.v2.b64 [param0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; - ; CHECK: st.param.v2.b64 [param0+16], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; ; CHECK: .param .align 16 .b8 retval0[32]; + ; CHECK-DAG: st.param.v2.b64 [param0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; + ; CHECK-DAG: st.param.v2.b64 [param0+16], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}; ; CHECK: call.uni (retval0), callee_St8x4, (param0); ; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0]; ; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0+16]; diff --git a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir index 5d0d6f6..4a53152 100644 --- a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir +++ b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir @@ -77,7 +77,7 @@ constants: [] machineFunctionInfo: {} body: | bb.0: - %0:b32, %1:b32, %2:b32, %3:b32 = LoadParamMemV4I32 0 + %0:b32, %1:b32, %2:b32, %3:b32 = LDV_i32_v4 0, 0, 101, 3, 32, &retval0, 0 :: (load (s128), addrspace 101) ; CHECK-NOT: ProxyReg %4:b32 = ProxyRegB32 killed %0 %5:b32 = ProxyRegB32 killed %1 @@ -86,7 +86,7 @@ body: | ; CHECK: STV_i32_v4 killed %0, killed %1, killed %2, killed %3 STV_i32_v4 killed %4, killed %5, killed %6, killed %7, 0, 0, 101, 32, &func_retval0, 0 :: (store (s128), addrspace 101) - %8:b32 = LoadParamMemI32 0 + %8:b32 = LD_i32 0, 0, 101, 3, 32, &retval0, 0 :: (load (s32), addrspace 101) ; CHECK-NOT: ProxyReg %9:b32 = ProxyRegB32 killed %8 %10:b32 = ProxyRegB32 killed %9 diff --git a/llvm/test/CodeGen/NVPTX/st-param-imm.ll b/llvm/test/CodeGen/NVPTX/st-param-imm.ll index 6aa1119..f90435a 100644 --- a/llvm/test/CodeGen/NVPTX/st-param-imm.ll +++ b/llvm/test/CodeGen/NVPTX/st-param-imm.ll @@ -26,8 +26,8 @@ define void @st_param_i8_i16() { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .align 2 .b8 param0[4]; -; CHECK-NEXT: st.param.b8 [param0], 1; ; CHECK-NEXT: st.param.b16 [param0+2], 2; +; CHECK-NEXT: st.param.b8 [param0], 1; ; 
CHECK-NEXT: call.uni call_i8_i16, (param0); ; CHECK-NEXT: } // callseq 0 ; CHECK-NEXT: ret; @@ -75,7 +75,7 @@ define void @st_param_f32() { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { // callseq 3, 0 ; CHECK-NEXT: .param .b32 param0; -; CHECK-NEXT: st.param.b32 [param0], 0f40A00000; +; CHECK-NEXT: st.param.b32 [param0], 1084227584; ; CHECK-NEXT: call.uni call_f32, (param0); ; CHECK-NEXT: } // callseq 3 ; CHECK-NEXT: ret; @@ -91,7 +91,7 @@ define void @st_param_f64() { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { // callseq 4, 0 ; CHECK-NEXT: .param .b64 param0; -; CHECK-NEXT: st.param.b64 [param0], 0d4018000000000000; +; CHECK-NEXT: st.param.b64 [param0], 4618441417868443648; ; CHECK-NEXT: call.uni call_f64, (param0); ; CHECK-NEXT: } // callseq 4 ; CHECK-NEXT: ret; @@ -165,7 +165,7 @@ define void @st_param_v2_i16_ii() { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { // callseq 8, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.v2.b16 [param0], {1, 2}; +; CHECK-NEXT: st.param.b32 [param0], 131073; ; CHECK-NEXT: call.uni call_v2_i16, (param0); ; CHECK-NEXT: } // callseq 8 ; CHECK-NEXT: ret; @@ -432,7 +432,7 @@ define void @st_param_v4_i8_iiii() { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { // callseq 23, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.v4.b8 [param0], {1, 2, 3, 4}; +; CHECK-NEXT: st.param.b32 [param0], 67305985; ; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 23 ; CHECK-NEXT: ret; @@ -442,15 +442,18 @@ define void @st_param_v4_i8_iiii() { define void @st_param_v4_i8_irrr(i8 %b, i8 %c, i8 %d) { ; CHECK-LABEL: st_param_v4_i8_irrr( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_irrr_param_2]; -; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_irrr_param_1]; -; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_irrr_param_0]; +; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_irrr_param_2]; +; CHECK-NEXT: ld.param.b8 %r2, [st_param_v4_i8_irrr_param_1]; +; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U; +; CHECK-NEXT: ld.param.b8 %r4, [st_param_v4_i8_irrr_param_0]; +; CHECK-NEXT: prmt.b32 %r5, 1, %r4, 0x3340U; +; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x5410U; ; CHECK-NEXT: { // callseq 24, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs3, %rs2, %rs1}; +; CHECK-NEXT: st.param.b32 [param0], %r6; ; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 24 ; CHECK-NEXT: ret; @@ -464,15 +467,18 @@ define void @st_param_v4_i8_irrr(i8 %b, i8 %c, i8 %d) { define void @st_param_v4_i8_rirr(i8 %a, i8 %c, i8 %d) { ; CHECK-LABEL: st_param_v4_i8_rirr( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rirr_param_2]; -; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_rirr_param_1]; -; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_rirr_param_0]; +; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_rirr_param_2]; +; CHECK-NEXT: ld.param.b8 %r2, [st_param_v4_i8_rirr_param_1]; +; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U; +; CHECK-NEXT: ld.param.b8 %r4, [st_param_v4_i8_rirr_param_0]; +; CHECK-NEXT: prmt.b32 %r5, %r4, 2, 0x3340U; +; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x5410U; ; CHECK-NEXT: { // callseq 25, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.v4.b8 [param0], {%rs3, 2, %rs2, %rs1}; +; CHECK-NEXT: st.param.b32 [param0], %r6; ; CHECK-NEXT: call.uni 
call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 25 ; CHECK-NEXT: ret; @@ -486,15 +492,18 @@ define void @st_param_v4_i8_rirr(i8 %a, i8 %c, i8 %d) { define void @st_param_v4_i8_rrir(i8 %a, i8 %b, i8 %d) { ; CHECK-LABEL: st_param_v4_i8_rrir( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rrir_param_2]; -; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_rrir_param_1]; -; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_rrir_param_0]; +; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_rrir_param_1]; +; CHECK-NEXT: ld.param.b8 %r2, [st_param_v4_i8_rrir_param_0]; +; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U; +; CHECK-NEXT: ld.param.b8 %r4, [st_param_v4_i8_rrir_param_2]; +; CHECK-NEXT: prmt.b32 %r5, 3, %r4, 0x3340U; +; CHECK-NEXT: prmt.b32 %r6, %r3, %r5, 0x5410U; ; CHECK-NEXT: { // callseq 26, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.v4.b8 [param0], {%rs3, %rs2, 3, %rs1}; +; CHECK-NEXT: st.param.b32 [param0], %r6; ; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 26 ; CHECK-NEXT: ret; @@ -508,15 +517,18 @@ define void @st_param_v4_i8_rrir(i8 %a, i8 %b, i8 %d) { define void @st_param_v4_i8_rrri(i8 %a, i8 %b, i8 %c) { ; CHECK-LABEL: st_param_v4_i8_rrri( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rrri_param_2]; -; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_rrri_param_1]; -; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_rrri_param_0]; +; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_rrri_param_1]; +; CHECK-NEXT: ld.param.b8 %r2, [st_param_v4_i8_rrri_param_0]; +; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U; +; CHECK-NEXT: ld.param.b8 %r4, [st_param_v4_i8_rrri_param_2]; +; CHECK-NEXT: prmt.b32 %r5, %r4, 4, 0x3340U; +; CHECK-NEXT: prmt.b32 %r6, %r3, %r5, 0x5410U; ; CHECK-NEXT: { // callseq 27, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.v4.b8 [param0], {%rs3, %rs2, %rs1, 4}; +; CHECK-NEXT: st.param.b32 [param0], %r6; ; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 27 ; CHECK-NEXT: ret; @@ -530,14 +542,16 @@ define void @st_param_v4_i8_rrri(i8 %a, i8 %b, i8 %c) { define void @st_param_v4_i8_iirr(i8 %c, i8 %d) { ; CHECK-LABEL: st_param_v4_i8_iirr( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_iirr_param_1]; -; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_iirr_param_0]; +; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_iirr_param_1]; +; CHECK-NEXT: ld.param.b8 %r2, [st_param_v4_i8_iirr_param_0]; +; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U; +; CHECK-NEXT: prmt.b32 %r4, 513, %r3, 0x5410U; ; CHECK-NEXT: { // callseq 28, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.v4.b8 [param0], {1, 2, %rs2, %rs1}; +; CHECK-NEXT: st.param.b32 [param0], %r4; ; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 28 ; CHECK-NEXT: ret; @@ -551,14 +565,17 @@ define void @st_param_v4_i8_iirr(i8 %c, i8 %d) { define void @st_param_v4_i8_irir(i8 %b, i8 %d) { ; CHECK-LABEL: st_param_v4_i8_irir( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_irir_param_1]; -; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_irir_param_0]; +; 
CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_irir_param_1]; +; CHECK-NEXT: prmt.b32 %r2, 3, %r1, 0x3340U; +; CHECK-NEXT: ld.param.b8 %r3, [st_param_v4_i8_irir_param_0]; +; CHECK-NEXT: prmt.b32 %r4, 1, %r3, 0x3340U; +; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U; ; CHECK-NEXT: { // callseq 29, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs2, 3, %rs1}; +; CHECK-NEXT: st.param.b32 [param0], %r5; ; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 29 ; CHECK-NEXT: ret; @@ -572,14 +589,17 @@ define void @st_param_v4_i8_irir(i8 %b, i8 %d) { define void @st_param_v4_i8_irri(i8 %b, i8 %c) { ; CHECK-LABEL: st_param_v4_i8_irri( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_irri_param_1]; -; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_irri_param_0]; +; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_irri_param_1]; +; CHECK-NEXT: prmt.b32 %r2, %r1, 4, 0x3340U; +; CHECK-NEXT: ld.param.b8 %r3, [st_param_v4_i8_irri_param_0]; +; CHECK-NEXT: prmt.b32 %r4, 1, %r3, 0x3340U; +; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U; ; CHECK-NEXT: { // callseq 30, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs2, %rs1, 4}; +; CHECK-NEXT: st.param.b32 [param0], %r5; ; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 30 ; CHECK-NEXT: ret; @@ -593,14 +613,17 @@ define void @st_param_v4_i8_irri(i8 %b, i8 %c) { define void @st_param_v4_i8_riir(i8 %a, i8 %d) { ; CHECK-LABEL: st_param_v4_i8_riir( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_riir_param_1]; -; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_riir_param_0]; +; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_riir_param_1]; +; CHECK-NEXT: prmt.b32 %r2, 3, %r1, 0x3340U; +; CHECK-NEXT: ld.param.b8 %r3, [st_param_v4_i8_riir_param_0]; +; CHECK-NEXT: prmt.b32 %r4, %r3, 2, 0x3340U; +; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U; ; CHECK-NEXT: { // callseq 31, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.v4.b8 [param0], {%rs2, 2, 3, %rs1}; +; CHECK-NEXT: st.param.b32 [param0], %r5; ; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 31 ; CHECK-NEXT: ret; @@ -614,14 +637,17 @@ define void @st_param_v4_i8_riir(i8 %a, i8 %d) { define void @st_param_v4_i8_riri(i8 %a, i8 %c) { ; CHECK-LABEL: st_param_v4_i8_riri( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_riri_param_1]; -; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_riri_param_0]; +; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_riri_param_1]; +; CHECK-NEXT: prmt.b32 %r2, %r1, 4, 0x3340U; +; CHECK-NEXT: ld.param.b8 %r3, [st_param_v4_i8_riri_param_0]; +; CHECK-NEXT: prmt.b32 %r4, %r3, 2, 0x3340U; +; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U; ; CHECK-NEXT: { // callseq 32, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.v4.b8 [param0], {%rs2, 2, %rs1, 4}; +; CHECK-NEXT: st.param.b32 [param0], %r5; ; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 32 ; CHECK-NEXT: ret; @@ -635,14 +661,16 @@ define void @st_param_v4_i8_riri(i8 %a, i8 %c) { define void @st_param_v4_i8_rrii(i8 %a, i8 %b) { ; CHECK-LABEL: st_param_v4_i8_rrii( ; CHECK: { -; CHECK-NEXT: .reg .b16 
%rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rrii_param_1]; -; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_rrii_param_0]; +; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_rrii_param_1]; +; CHECK-NEXT: ld.param.b8 %r2, [st_param_v4_i8_rrii_param_0]; +; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U; +; CHECK-NEXT: prmt.b32 %r4, %r3, 1027, 0x5410U; ; CHECK-NEXT: { // callseq 33, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.v4.b8 [param0], {%rs2, %rs1, 3, 4}; +; CHECK-NEXT: st.param.b32 [param0], %r4; ; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 33 ; CHECK-NEXT: ret; @@ -656,13 +684,15 @@ define void @st_param_v4_i8_rrii(i8 %a, i8 %b) { define void @st_param_v4_i8_iiir(i8 %d) { ; CHECK-LABEL: st_param_v4_i8_iiir( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_iiir_param_0]; ; CHECK-NEXT: { // callseq 34, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.v4.b8 [param0], {1, 2, 3, %rs1}; +; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_iiir_param_0]; +; CHECK-NEXT: prmt.b32 %r2, 3, %r1, 0x3340U; +; CHECK-NEXT: prmt.b32 %r3, 513, %r2, 0x5410U; +; CHECK-NEXT: st.param.b32 [param0], %r3; ; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 34 ; CHECK-NEXT: ret; @@ -676,13 +706,15 @@ define void @st_param_v4_i8_iiir(i8 %d) { define void @st_param_v4_i8_iiri(i8 %c) { ; CHECK-LABEL: st_param_v4_i8_iiri( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_iiri_param_0]; ; CHECK-NEXT: { // callseq 35, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.v4.b8 [param0], {1, 2, %rs1, 4}; +; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_iiri_param_0]; +; CHECK-NEXT: prmt.b32 %r2, %r1, 4, 0x3340U; +; CHECK-NEXT: prmt.b32 %r3, 513, %r2, 0x5410U; +; CHECK-NEXT: st.param.b32 [param0], %r3; ; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 35 ; CHECK-NEXT: ret; @@ -696,13 +728,15 @@ define void @st_param_v4_i8_iiri(i8 %c) { define void @st_param_v4_i8_irii(i8 %b) { ; CHECK-LABEL: st_param_v4_i8_irii( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_irii_param_0]; ; CHECK-NEXT: { // callseq 36, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs1, 3, 4}; +; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_irii_param_0]; +; CHECK-NEXT: prmt.b32 %r2, 1, %r1, 0x3340U; +; CHECK-NEXT: prmt.b32 %r3, %r2, 1027, 0x5410U; +; CHECK-NEXT: st.param.b32 [param0], %r3; ; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 36 ; CHECK-NEXT: ret; @@ -716,13 +750,15 @@ define void @st_param_v4_i8_irii(i8 %b) { define void @st_param_v4_i8_riii(i8 %a) { ; CHECK-LABEL: st_param_v4_i8_riii( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_riii_param_0]; ; CHECK-NEXT: { // callseq 37, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, 2, 3, 4}; +; CHECK-NEXT: ld.param.b8 %r1, [st_param_v4_i8_riii_param_0]; +; CHECK-NEXT: prmt.b32 %r2, %r1, 2, 0x3340U; +; CHECK-NEXT: prmt.b32 
%r3, %r2, 1027, 0x5410U; +; CHECK-NEXT: st.param.b32 [param0], %r3; ; CHECK-NEXT: call.uni call_v4_i8, (param0); ; CHECK-NEXT: } // callseq 37 ; CHECK-NEXT: ret; @@ -742,7 +778,7 @@ define void @st_param_v4_i16_iiii() { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { // callseq 38, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; -; CHECK-NEXT: st.param.v4.b16 [param0], {1, 2, 3, 4}; +; CHECK-NEXT: st.param.v2.b32 [param0], {131073, 262147}; ; CHECK-NEXT: call.uni call_v4_i16, (param0); ; CHECK-NEXT: } // callseq 38 ; CHECK-NEXT: ret; @@ -841,13 +877,15 @@ define void @st_param_v4_i16_iirr(i16 %c, i16 %d) { ; CHECK-LABEL: st_param_v4_i16_iirr( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_iirr_param_0]; ; CHECK-NEXT: ld.param.b16 %rs2, [st_param_v4_i16_iirr_param_1]; +; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2}; ; CHECK-NEXT: { // callseq 43, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; -; CHECK-NEXT: st.param.v4.b16 [param0], {1, 2, %rs1, %rs2}; +; CHECK-NEXT: st.param.v2.b32 [param0], {131073, %r1}; ; CHECK-NEXT: call.uni call_v4_i16, (param0); ; CHECK-NEXT: } // callseq 43 ; CHECK-NEXT: ret; @@ -946,13 +984,15 @@ define void @st_param_v4_i16_rrii(i16 %a, i16 %b) { ; CHECK-LABEL: st_param_v4_i16_rrii( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_rrii_param_0]; ; CHECK-NEXT: ld.param.b16 %rs2, [st_param_v4_i16_rrii_param_1]; +; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2}; ; CHECK-NEXT: { // callseq 48, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; -; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, %rs2, 3, 4}; +; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, 262147}; ; CHECK-NEXT: call.uni call_v4_i16, (param0); ; CHECK-NEXT: } // callseq 48 ; CHECK-NEXT: ret; @@ -966,13 +1006,16 @@ define void @st_param_v4_i16_rrii(i16 %a, i16 %b) { define void @st_param_v4_i16_iiir(i16 %d) { ; CHECK-LABEL: st_param_v4_i16_iiir( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_iiir_param_0]; +; CHECK-NEXT: mov.b16 %rs2, 3; +; CHECK-NEXT: mov.b32 %r1, {%rs2, %rs1}; ; CHECK-NEXT: { // callseq 49, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; -; CHECK-NEXT: st.param.v4.b16 [param0], {1, 2, 3, %rs1}; +; CHECK-NEXT: st.param.v2.b32 [param0], {131073, %r1}; ; CHECK-NEXT: call.uni call_v4_i16, (param0); ; CHECK-NEXT: } // callseq 49 ; CHECK-NEXT: ret; @@ -986,13 +1029,16 @@ define void @st_param_v4_i16_iiir(i16 %d) { define void @st_param_v4_i16_iiri(i16 %c) { ; CHECK-LABEL: st_param_v4_i16_iiri( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_iiri_param_0]; +; CHECK-NEXT: mov.b16 %rs2, 4; +; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2}; ; CHECK-NEXT: { // callseq 50, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; -; CHECK-NEXT: st.param.v4.b16 [param0], {1, 2, %rs1, 4}; +; CHECK-NEXT: st.param.v2.b32 [param0], {131073, %r1}; ; CHECK-NEXT: call.uni call_v4_i16, (param0); ; CHECK-NEXT: } // callseq 50 ; CHECK-NEXT: ret; @@ -1006,13 +1052,16 @@ define void @st_param_v4_i16_iiri(i16 %c) { define void @st_param_v4_i16_irii(i16 %b) { ; CHECK-LABEL: st_param_v4_i16_irii( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<2>; +; 
CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_irii_param_0]; +; CHECK-NEXT: mov.b16 %rs2, 1; +; CHECK-NEXT: mov.b32 %r1, {%rs2, %rs1}; ; CHECK-NEXT: { // callseq 51, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; -; CHECK-NEXT: st.param.v4.b16 [param0], {1, %rs1, 3, 4}; +; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, 262147}; ; CHECK-NEXT: call.uni call_v4_i16, (param0); ; CHECK-NEXT: } // callseq 51 ; CHECK-NEXT: ret; @@ -1026,13 +1075,16 @@ define void @st_param_v4_i16_irii(i16 %b) { define void @st_param_v4_i16_riii(i16 %a) { ; CHECK-LABEL: st_param_v4_i16_riii( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_riii_param_0]; +; CHECK-NEXT: mov.b16 %rs2, 2; +; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2}; ; CHECK-NEXT: { // callseq 52, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; -; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, 2, 3, 4}; +; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, 262147}; ; CHECK-NEXT: call.uni call_v4_i16, (param0); ; CHECK-NEXT: } // callseq 52 ; CHECK-NEXT: ret; @@ -1672,13 +1724,12 @@ declare void @call_v4_f32(%struct.float4 alignstack(16)) define void @st_param_bfloat() { ; CHECK-LABEL: st_param_bfloat( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-EMPTY: ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov.b16 %rs1, 0x4100; ; CHECK-NEXT: { // callseq 83, 0 ; CHECK-NEXT: .param .align 2 .b8 param0[2]; -; CHECK-NEXT: st.param.b16 [param0], %rs1; +; CHECK-NEXT: st.param.b16 [param0], 0x4100; ; CHECK-NEXT: call.uni call_bfloat, (param0); ; CHECK-NEXT: } // callseq 83 ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/store-undef.ll b/llvm/test/CodeGen/NVPTX/store-undef.ll index 5b31b5e..c8ca6b6 100644 --- a/llvm/test/CodeGen/NVPTX/store-undef.ll +++ b/llvm/test/CodeGen/NVPTX/store-undef.ll @@ -34,9 +34,9 @@ define void @test_store_param_def(i64 %param0, i32 %param1) { ; CHECK-NEXT: ld.param.b32 %r1, [test_store_param_def_param_1]; ; CHECK-NEXT: { // callseq 1, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[32]; +; CHECK-NEXT: st.param.v4.b32 [param0+16], {%r2, %r1, %r3, %r4}; +; CHECK-NEXT: st.param.v2.b32 [param0+8], {%r5, %r1}; ; CHECK-NEXT: st.param.b64 [param0], %rd1; -; CHECK-NEXT: st.param.v2.b32 [param0+8], {%r2, %r1}; -; CHECK-NEXT: st.param.v4.b32 [param0+16], {%r3, %r1, %r4, %r5}; ; CHECK-NEXT: call.uni test_call, (param0); ; CHECK-NEXT: } // callseq 1 ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll index d6961a9..3138d7c 100644 --- a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll +++ b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll @@ -69,8 +69,8 @@ define ptx_kernel void @baz(ptr %red, i32 %idx) { ; CHECK-NEXT: tex.1d.v4.f32.s32 {%r2, %r3, %r4, %r5}, [tex0, {%r1}]; ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .b64 param0; -; CHECK-NEXT: st.param.b64 [param0], %rd3; ; CHECK-NEXT: .param .b32 retval0; +; CHECK-NEXT: st.param.b64 [param0], %rd3; ; CHECK-NEXT: call.uni (retval0), texfunc, (param0); ; CHECK-NEXT: ld.param.b32 %r6, [retval0]; ; CHECK-NEXT: } // callseq 0 diff --git a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll index 87e46b1..697eb90 100644 --- a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll +++ b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll 
@@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; Verifies correctness of load/store of parameters and return values. -; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap %s -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | %ptxas-verify %} +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap %s +; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | %ptxas-verify %} %s_i8i16p = type { <{ i16, i8, i16 }>, i64 } %s_i8i32p = type { <{ i32, i8, i32 }>, i64 } @@ -24,37 +24,35 @@ define %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) { ; CHECK-LABEL: test_s_i8i16p( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<15>; +; CHECK-NEXT: .reg .b16 %rs<13>; +; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs4, [test_s_i8i16p_param_0+4]; -; CHECK-NEXT: shl.b16 %rs5, %rs4, 8; -; CHECK-NEXT: ld.param.b8 %rs6, [test_s_i8i16p_param_0+3]; -; CHECK-NEXT: or.b16 %rs3, %rs5, %rs6; +; CHECK-NEXT: ld.param.b32 %r1, [test_s_i8i16p_param_0]; ; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8i16p_param_0+8]; -; CHECK-NEXT: ld.param.b8 %rs2, [test_s_i8i16p_param_0+2]; -; CHECK-NEXT: ld.param.b16 %rs1, [test_s_i8i16p_param_0]; +; CHECK-NEXT: ld.param.b8 %rs1, [test_s_i8i16p_param_0+4]; ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[16]; -; CHECK-NEXT: st.param.b16 [param0], %rs1; -; CHECK-NEXT: st.param.b8 [param0+2], %rs2; -; CHECK-NEXT: st.param.b8 [param0+3], %rs3; -; CHECK-NEXT: st.param.b8 [param0+4], %rs4; -; CHECK-NEXT: st.param.b64 [param0+8], %rd1; ; CHECK-NEXT: .param .align 8 .b8 retval0[16]; +; CHECK-NEXT: st.param.b8 [param0+4], %rs1; +; CHECK-NEXT: st.param.b64 [param0+8], %rd1; +; CHECK-NEXT: st.param.b32 [param0], %r1; ; CHECK-NEXT: call.uni (retval0), test_s_i8i16p, (param0); -; CHECK-NEXT: ld.param.b16 %rs7, [retval0]; -; CHECK-NEXT: ld.param.b8 %rs8, [retval0+2]; -; CHECK-NEXT: ld.param.b8 %rs9, [retval0+3]; -; CHECK-NEXT: ld.param.b8 %rs10, [retval0+4]; ; CHECK-NEXT: ld.param.b64 %rd2, [retval0+8]; +; CHECK-NEXT: ld.param.b8 %rs2, [retval0+2]; +; CHECK-NEXT: ld.param.b16 %rs3, [retval0]; +; CHECK-NEXT: ld.param.b8 %rs4, [retval0+4]; +; CHECK-NEXT: ld.param.b8 %rs5, [retval0+3]; ; CHECK-NEXT: } // callseq 0 -; CHECK-NEXT: st.param.b16 [func_retval0], %rs7; -; CHECK-NEXT: st.param.b8 [func_retval0+2], %rs8; -; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs10; -; CHECK-NEXT: st.param.b8 [func_retval0+3], %rs9; +; CHECK-NEXT: shl.b16 %rs8, %rs4, 8; +; CHECK-NEXT: or.b16 %rs9, %rs8, %rs5; +; CHECK-NEXT: st.param.b8 [func_retval0+3], %rs5; ; CHECK-NEXT: st.param.b64 [func_retval0+8], %rd2; +; CHECK-NEXT: st.param.b8 [func_retval0+2], %rs2; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-NEXT: shr.u16 %rs12, %rs9, 8; +; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs12; ; CHECK-NEXT: ret; %r = tail call %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) ret %s_i8i16p %r @@ -64,56 +62,51 @@ define %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) { define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) { ; CHECK-LABEL: test_s_i8i32p( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<12>; -; CHECK-NEXT: .reg .b32 %r<20>; +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<24>; ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %r3, 
[test_s_i8i32p_param_0+6]; -; CHECK-NEXT: shl.b32 %r4, %r3, 8; -; CHECK-NEXT: ld.param.b8 %r5, [test_s_i8i32p_param_0+5]; -; CHECK-NEXT: or.b32 %r6, %r4, %r5; -; CHECK-NEXT: ld.param.b8 %r7, [test_s_i8i32p_param_0+7]; -; CHECK-NEXT: shl.b32 %r8, %r7, 16; -; CHECK-NEXT: ld.param.b8 %r9, [test_s_i8i32p_param_0+8]; -; CHECK-NEXT: shl.b32 %r10, %r9, 24; -; CHECK-NEXT: or.b32 %r11, %r10, %r8; -; CHECK-NEXT: or.b32 %r2, %r11, %r6; -; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8i32p_param_0+16]; -; CHECK-NEXT: ld.param.b8 %rs1, [test_s_i8i32p_param_0+4]; ; CHECK-NEXT: ld.param.b32 %r1, [test_s_i8i32p_param_0]; -; CHECK-NEXT: shr.u32 %r12, %r2, 8; -; CHECK-NEXT: shr.u32 %r13, %r11, 16; +; CHECK-NEXT: ld.param.b16 %rs1, [test_s_i8i32p_param_0+4]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8i32p_param_0+16]; +; CHECK-NEXT: ld.param.b8 %r2, [test_s_i8i32p_param_0+6]; +; CHECK-NEXT: ld.param.b8 %r3, [test_s_i8i32p_param_0+7]; +; CHECK-NEXT: ld.param.b8 %r4, [test_s_i8i32p_param_0+8]; ; CHECK-NEXT: { // callseq 1, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[24]; -; CHECK-NEXT: st.param.b32 [param0], %r1; -; CHECK-NEXT: st.param.b8 [param0+4], %rs1; -; CHECK-NEXT: st.param.b8 [param0+5], %r2; -; CHECK-NEXT: st.param.b8 [param0+6], %r12; -; CHECK-NEXT: st.param.b8 [param0+7], %r13; -; CHECK-NEXT: st.param.b8 [param0+8], %r9; -; CHECK-NEXT: st.param.b64 [param0+16], %rd1; ; CHECK-NEXT: .param .align 8 .b8 retval0[24]; +; CHECK-NEXT: st.param.b8 [param0+8], %r4; +; CHECK-NEXT: st.param.b8 [param0+7], %r3; +; CHECK-NEXT: st.param.b8 [param0+6], %r2; +; CHECK-NEXT: st.param.b64 [param0+16], %rd1; +; CHECK-NEXT: st.param.b16 [param0+4], %rs1; +; CHECK-NEXT: st.param.b32 [param0], %r1; ; CHECK-NEXT: call.uni (retval0), test_s_i8i32p, (param0); -; CHECK-NEXT: ld.param.b32 %r14, [retval0]; -; CHECK-NEXT: ld.param.b8 %rs2, [retval0+4]; -; CHECK-NEXT: ld.param.b8 %rs3, [retval0+5]; -; CHECK-NEXT: ld.param.b8 %rs4, [retval0+6]; -; CHECK-NEXT: ld.param.b8 %rs5, [retval0+7]; -; CHECK-NEXT: ld.param.b8 %rs6, [retval0+8]; ; CHECK-NEXT: ld.param.b64 %rd2, [retval0+16]; +; CHECK-NEXT: ld.param.b8 %rs2, [retval0+4]; +; CHECK-NEXT: ld.param.b32 %r5, [retval0]; +; CHECK-NEXT: ld.param.b8 %r6, [retval0+8]; +; CHECK-NEXT: ld.param.b8 %r7, [retval0+7]; +; CHECK-NEXT: ld.param.b8 %r8, [retval0+6]; +; CHECK-NEXT: ld.param.b8 %r9, [retval0+5]; ; CHECK-NEXT: } // callseq 1 -; CHECK-NEXT: cvt.u32.u16 %r15, %rs3; -; CHECK-NEXT: cvt.u32.u16 %r16, %rs4; -; CHECK-NEXT: cvt.u32.u16 %r17, %rs5; -; CHECK-NEXT: cvt.u32.u16 %r18, %rs6; -; CHECK-NEXT: st.param.b32 [func_retval0], %r14; -; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2; -; CHECK-NEXT: st.param.b8 [func_retval0+8], %r18; -; CHECK-NEXT: st.param.b8 [func_retval0+7], %r17; -; CHECK-NEXT: st.param.b8 [func_retval0+6], %r16; -; CHECK-NEXT: st.param.b8 [func_retval0+5], %r15; +; CHECK-NEXT: shl.b32 %r12, %r8, 8; +; CHECK-NEXT: or.b32 %r13, %r12, %r9; +; CHECK-NEXT: shl.b32 %r15, %r7, 16; +; CHECK-NEXT: shl.b32 %r17, %r6, 24; +; CHECK-NEXT: or.b32 %r18, %r17, %r15; +; CHECK-NEXT: or.b32 %r19, %r18, %r13; +; CHECK-NEXT: st.param.b8 [func_retval0+5], %r9; ; CHECK-NEXT: st.param.b64 [func_retval0+16], %rd2; +; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NEXT: shr.u32 %r21, %r19, 24; +; CHECK-NEXT: st.param.b8 [func_retval0+8], %r21; +; CHECK-NEXT: shr.u32 %r22, %r19, 16; +; CHECK-NEXT: st.param.b8 [func_retval0+7], %r22; +; CHECK-NEXT: shr.u32 %r23, %r19, 8; +; CHECK-NEXT: st.param.b8 [func_retval0+6], %r23; ; CHECK-NEXT: ret; 
%r = tail call %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) ret %s_i8i32p %r @@ -123,112 +116,66 @@ define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) { define %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) { ; CHECK-LABEL: test_s_i8i64p( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<20>; -; CHECK-NEXT: .reg .b64 %rd<68>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<46>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rd4, [test_s_i8i64p_param_0+10]; -; CHECK-NEXT: shl.b64 %rd5, %rd4, 8; -; CHECK-NEXT: ld.param.b8 %rd6, [test_s_i8i64p_param_0+9]; -; CHECK-NEXT: or.b64 %rd7, %rd5, %rd6; -; CHECK-NEXT: ld.param.b8 %rd8, [test_s_i8i64p_param_0+11]; -; CHECK-NEXT: shl.b64 %rd9, %rd8, 16; -; CHECK-NEXT: ld.param.b8 %rd10, [test_s_i8i64p_param_0+12]; -; CHECK-NEXT: shl.b64 %rd11, %rd10, 24; -; CHECK-NEXT: or.b64 %rd12, %rd11, %rd9; -; CHECK-NEXT: or.b64 %rd13, %rd12, %rd7; -; CHECK-NEXT: ld.param.b8 %rd14, [test_s_i8i64p_param_0+14]; -; CHECK-NEXT: shl.b64 %rd15, %rd14, 8; -; CHECK-NEXT: ld.param.b8 %rd16, [test_s_i8i64p_param_0+13]; -; CHECK-NEXT: or.b64 %rd17, %rd15, %rd16; -; CHECK-NEXT: ld.param.b8 %rd18, [test_s_i8i64p_param_0+15]; -; CHECK-NEXT: shl.b64 %rd19, %rd18, 16; -; CHECK-NEXT: ld.param.b8 %rd20, [test_s_i8i64p_param_0+16]; -; CHECK-NEXT: shl.b64 %rd21, %rd20, 24; -; CHECK-NEXT: or.b64 %rd22, %rd21, %rd19; -; CHECK-NEXT: or.b64 %rd23, %rd22, %rd17; -; CHECK-NEXT: shl.b64 %rd24, %rd23, 32; -; CHECK-NEXT: or.b64 %rd2, %rd24, %rd13; -; CHECK-NEXT: ld.param.b64 %rd3, [test_s_i8i64p_param_0+24]; -; CHECK-NEXT: ld.param.b8 %rs1, [test_s_i8i64p_param_0+8]; ; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8i64p_param_0]; -; CHECK-NEXT: shr.u64 %rd25, %rd2, 8; -; CHECK-NEXT: shr.u64 %rd26, %rd2, 16; -; CHECK-NEXT: shr.u64 %rd27, %rd2, 24; -; CHECK-NEXT: bfe.u64 %rd28, %rd23, 8, 24; -; CHECK-NEXT: bfe.u64 %rd29, %rd23, 16, 16; -; CHECK-NEXT: bfe.u64 %rd30, %rd23, 24, 8; +; CHECK-NEXT: ld.param.b64 %rd2, [test_s_i8i64p_param_0+8]; +; CHECK-NEXT: ld.param.b64 %rd3, [test_s_i8i64p_param_0+24]; +; CHECK-NEXT: ld.param.b8 %rd4, [test_s_i8i64p_param_0+16]; ; CHECK-NEXT: { // callseq 2, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[32]; -; CHECK-NEXT: st.param.b64 [param0], %rd1; -; CHECK-NEXT: st.param.b8 [param0+8], %rs1; -; CHECK-NEXT: st.param.b8 [param0+9], %rd2; -; CHECK-NEXT: st.param.b8 [param0+10], %rd25; -; CHECK-NEXT: st.param.b8 [param0+11], %rd26; -; CHECK-NEXT: st.param.b8 [param0+12], %rd27; -; CHECK-NEXT: st.param.b8 [param0+13], %rd23; -; CHECK-NEXT: st.param.b8 [param0+14], %rd28; -; CHECK-NEXT: st.param.b8 [param0+15], %rd29; -; CHECK-NEXT: st.param.b8 [param0+16], %rd30; -; CHECK-NEXT: st.param.b64 [param0+24], %rd3; ; CHECK-NEXT: .param .align 8 .b8 retval0[32]; +; CHECK-NEXT: st.param.b8 [param0+16], %rd4; +; CHECK-NEXT: st.param.b64 [param0+24], %rd3; +; CHECK-NEXT: st.param.b64 [param0+8], %rd2; +; CHECK-NEXT: st.param.b64 [param0], %rd1; ; CHECK-NEXT: call.uni (retval0), test_s_i8i64p, (param0); -; CHECK-NEXT: ld.param.b64 %rd31, [retval0]; -; CHECK-NEXT: ld.param.b8 %rs2, [retval0+8]; -; CHECK-NEXT: ld.param.b8 %rs3, [retval0+9]; -; CHECK-NEXT: ld.param.b8 %rs4, [retval0+10]; -; CHECK-NEXT: ld.param.b8 %rs5, [retval0+11]; -; CHECK-NEXT: ld.param.b8 %rs6, [retval0+12]; -; CHECK-NEXT: ld.param.b8 %rs7, [retval0+13]; -; CHECK-NEXT: ld.param.b8 %rs8, [retval0+14]; -; CHECK-NEXT: ld.param.b8 %rs9, [retval0+15]; -; CHECK-NEXT: ld.param.b8 %rs10, [retval0+16]; -; CHECK-NEXT: ld.param.b64 %rd32, [retval0+24]; +; CHECK-NEXT: ld.param.b64 %rd5, [retval0+24]; +; CHECK-NEXT: 
ld.param.b8 %rs1, [retval0+8]; +; CHECK-NEXT: ld.param.b64 %rd6, [retval0]; +; CHECK-NEXT: ld.param.b8 %rd7, [retval0+16]; +; CHECK-NEXT: ld.param.b8 %rd8, [retval0+15]; +; CHECK-NEXT: ld.param.b8 %rd9, [retval0+14]; +; CHECK-NEXT: ld.param.b8 %rd10, [retval0+13]; +; CHECK-NEXT: ld.param.b8 %rd11, [retval0+12]; +; CHECK-NEXT: ld.param.b8 %rd12, [retval0+11]; +; CHECK-NEXT: ld.param.b8 %rd13, [retval0+10]; +; CHECK-NEXT: ld.param.b8 %rd14, [retval0+9]; ; CHECK-NEXT: } // callseq 2 -; CHECK-NEXT: cvt.u64.u16 %rd33, %rs3; -; CHECK-NEXT: and.b64 %rd34, %rd33, 255; -; CHECK-NEXT: cvt.u64.u16 %rd35, %rs4; -; CHECK-NEXT: and.b64 %rd36, %rd35, 255; -; CHECK-NEXT: shl.b64 %rd37, %rd36, 8; -; CHECK-NEXT: or.b64 %rd38, %rd34, %rd37; -; CHECK-NEXT: cvt.u64.u16 %rd39, %rs5; -; CHECK-NEXT: and.b64 %rd40, %rd39, 255; -; CHECK-NEXT: shl.b64 %rd41, %rd40, 16; -; CHECK-NEXT: or.b64 %rd42, %rd38, %rd41; -; CHECK-NEXT: cvt.u64.u16 %rd43, %rs6; -; CHECK-NEXT: and.b64 %rd44, %rd43, 255; -; CHECK-NEXT: shl.b64 %rd45, %rd44, 24; -; CHECK-NEXT: or.b64 %rd46, %rd42, %rd45; -; CHECK-NEXT: cvt.u64.u16 %rd47, %rs7; -; CHECK-NEXT: and.b64 %rd48, %rd47, 255; -; CHECK-NEXT: shl.b64 %rd49, %rd48, 32; -; CHECK-NEXT: or.b64 %rd50, %rd46, %rd49; -; CHECK-NEXT: cvt.u64.u16 %rd51, %rs8; -; CHECK-NEXT: and.b64 %rd52, %rd51, 255; -; CHECK-NEXT: shl.b64 %rd53, %rd52, 40; -; CHECK-NEXT: or.b64 %rd54, %rd50, %rd53; -; CHECK-NEXT: cvt.u64.u16 %rd55, %rs9; -; CHECK-NEXT: and.b64 %rd56, %rd55, 255; -; CHECK-NEXT: shl.b64 %rd57, %rd56, 48; -; CHECK-NEXT: or.b64 %rd58, %rd54, %rd57; -; CHECK-NEXT: cvt.u64.u16 %rd59, %rs10; -; CHECK-NEXT: shl.b64 %rd60, %rd59, 56; -; CHECK-NEXT: or.b64 %rd61, %rd58, %rd60; -; CHECK-NEXT: st.param.b64 [func_retval0], %rd31; -; CHECK-NEXT: st.param.b8 [func_retval0+8], %rs2; +; CHECK-NEXT: shl.b64 %rd17, %rd13, 8; +; CHECK-NEXT: or.b64 %rd18, %rd17, %rd14; +; CHECK-NEXT: shl.b64 %rd20, %rd12, 16; +; CHECK-NEXT: shl.b64 %rd22, %rd11, 24; +; CHECK-NEXT: or.b64 %rd23, %rd22, %rd20; +; CHECK-NEXT: or.b64 %rd24, %rd23, %rd18; +; CHECK-NEXT: shl.b64 %rd27, %rd9, 8; +; CHECK-NEXT: or.b64 %rd28, %rd27, %rd10; +; CHECK-NEXT: shl.b64 %rd30, %rd8, 16; +; CHECK-NEXT: shl.b64 %rd32, %rd7, 24; +; CHECK-NEXT: or.b64 %rd33, %rd32, %rd30; +; CHECK-NEXT: or.b64 %rd34, %rd33, %rd28; +; CHECK-NEXT: shl.b64 %rd35, %rd34, 32; +; CHECK-NEXT: or.b64 %rd36, %rd35, %rd24; +; CHECK-NEXT: st.param.b8 [func_retval0+9], %rd14; +; CHECK-NEXT: st.param.b64 [func_retval0+24], %rd5; +; CHECK-NEXT: st.param.b8 [func_retval0+8], %rs1; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd6; +; CHECK-NEXT: shr.u64 %rd39, %rd36, 56; +; CHECK-NEXT: st.param.b8 [func_retval0+16], %rd39; +; CHECK-NEXT: shr.u64 %rd40, %rd36, 48; +; CHECK-NEXT: st.param.b8 [func_retval0+15], %rd40; +; CHECK-NEXT: shr.u64 %rd41, %rd36, 40; +; CHECK-NEXT: st.param.b8 [func_retval0+14], %rd41; +; CHECK-NEXT: shr.u64 %rd42, %rd36, 32; +; CHECK-NEXT: st.param.b8 [func_retval0+13], %rd42; +; CHECK-NEXT: shr.u64 %rd43, %rd36, 24; ; CHECK-NEXT: st.param.b8 [func_retval0+12], %rd43; -; CHECK-NEXT: st.param.b8 [func_retval0+11], %rd39; -; CHECK-NEXT: st.param.b8 [func_retval0+10], %rd35; -; CHECK-NEXT: st.param.b8 [func_retval0+9], %rd33; -; CHECK-NEXT: shr.u64 %rd64, %rd50, 32; -; CHECK-NEXT: st.param.b8 [func_retval0+13], %rd64; -; CHECK-NEXT: shr.u64 %rd65, %rd54, 40; -; CHECK-NEXT: st.param.b8 [func_retval0+14], %rd65; -; CHECK-NEXT: shr.u64 %rd66, %rd58, 48; -; CHECK-NEXT: st.param.b8 [func_retval0+15], %rd66; -; CHECK-NEXT: shr.u64 %rd67, %rd61, 56; -; CHECK-NEXT: st.param.b8 
[func_retval0+16], %rd67; -; CHECK-NEXT: st.param.b64 [func_retval0+24], %rd32; +; CHECK-NEXT: shr.u64 %rd44, %rd36, 16; +; CHECK-NEXT: st.param.b8 [func_retval0+11], %rd44; +; CHECK-NEXT: shr.u64 %rd45, %rd36, 8; +; CHECK-NEXT: st.param.b8 [func_retval0+10], %rd45; ; CHECK-NEXT: ret; %r = tail call %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) ret %s_i8i64p %r @@ -242,33 +189,32 @@ define %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs4, [test_s_i8f16p_param_0+4]; -; CHECK-NEXT: shl.b16 %rs5, %rs4, 8; -; CHECK-NEXT: ld.param.b8 %rs6, [test_s_i8f16p_param_0+3]; -; CHECK-NEXT: or.b16 %rs3, %rs5, %rs6; -; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8f16p_param_0+8]; -; CHECK-NEXT: ld.param.b8 %rs2, [test_s_i8f16p_param_0+2]; ; CHECK-NEXT: ld.param.b16 %rs1, [test_s_i8f16p_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [test_s_i8f16p_param_0+2]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8f16p_param_0+8]; +; CHECK-NEXT: ld.param.b8 %rs3, [test_s_i8f16p_param_0+4]; ; CHECK-NEXT: { // callseq 3, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[16]; -; CHECK-NEXT: st.param.b16 [param0], %rs1; -; CHECK-NEXT: st.param.b8 [param0+2], %rs2; -; CHECK-NEXT: st.param.b8 [param0+3], %rs3; -; CHECK-NEXT: st.param.b8 [param0+4], %rs4; -; CHECK-NEXT: st.param.b64 [param0+8], %rd1; ; CHECK-NEXT: .param .align 8 .b8 retval0[16]; +; CHECK-NEXT: st.param.b8 [param0+4], %rs3; +; CHECK-NEXT: st.param.b64 [param0+8], %rd1; +; CHECK-NEXT: st.param.b16 [param0+2], %rs2; +; CHECK-NEXT: st.param.b16 [param0], %rs1; ; CHECK-NEXT: call.uni (retval0), test_s_i8f16p, (param0); -; CHECK-NEXT: ld.param.b16 %rs7, [retval0]; -; CHECK-NEXT: ld.param.b8 %rs8, [retval0+2]; -; CHECK-NEXT: ld.param.b8 %rs9, [retval0+3]; -; CHECK-NEXT: ld.param.b8 %rs10, [retval0+4]; ; CHECK-NEXT: ld.param.b64 %rd2, [retval0+8]; +; CHECK-NEXT: ld.param.b8 %rs4, [retval0+2]; +; CHECK-NEXT: ld.param.b16 %rs5, [retval0]; +; CHECK-NEXT: ld.param.b8 %rs6, [retval0+4]; +; CHECK-NEXT: ld.param.b8 %rs7, [retval0+3]; ; CHECK-NEXT: } // callseq 3 -; CHECK-NEXT: st.param.b16 [func_retval0], %rs7; -; CHECK-NEXT: st.param.b8 [func_retval0+2], %rs8; -; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs10; -; CHECK-NEXT: st.param.b8 [func_retval0+3], %rs9; +; CHECK-NEXT: shl.b16 %rs10, %rs6, 8; +; CHECK-NEXT: or.b16 %rs11, %rs10, %rs7; +; CHECK-NEXT: st.param.b8 [func_retval0+3], %rs7; ; CHECK-NEXT: st.param.b64 [func_retval0+8], %rd2; +; CHECK-NEXT: st.param.b8 [func_retval0+2], %rs4; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs5; +; CHECK-NEXT: shr.u16 %rs14, %rs11, 8; +; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs14; ; CHECK-NEXT: ret; %r = tail call %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) ret %s_i8f16p %r @@ -278,56 +224,51 @@ define %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) { define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) { ; CHECK-LABEL: test_s_i8f16x2p( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<12>; -; CHECK-NEXT: .reg .b32 %r<20>; +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<24>; ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %r3, [test_s_i8f16x2p_param_0+6]; -; CHECK-NEXT: shl.b32 %r4, %r3, 8; -; CHECK-NEXT: ld.param.b8 %r5, [test_s_i8f16x2p_param_0+5]; -; CHECK-NEXT: or.b32 %r6, %r4, %r5; -; CHECK-NEXT: ld.param.b8 %r7, [test_s_i8f16x2p_param_0+7]; -; CHECK-NEXT: shl.b32 %r8, %r7, 16; -; CHECK-NEXT: ld.param.b8 %r9, [test_s_i8f16x2p_param_0+8]; -; CHECK-NEXT: shl.b32 %r10, %r9, 24; -; CHECK-NEXT: or.b32 %r11, 
%r10, %r8; -; CHECK-NEXT: or.b32 %r2, %r11, %r6; -; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8f16x2p_param_0+16]; -; CHECK-NEXT: ld.param.b8 %rs1, [test_s_i8f16x2p_param_0+4]; ; CHECK-NEXT: ld.param.b32 %r1, [test_s_i8f16x2p_param_0]; -; CHECK-NEXT: shr.u32 %r12, %r2, 8; -; CHECK-NEXT: shr.u32 %r13, %r11, 16; +; CHECK-NEXT: ld.param.b16 %rs1, [test_s_i8f16x2p_param_0+4]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8f16x2p_param_0+16]; +; CHECK-NEXT: ld.param.b8 %r2, [test_s_i8f16x2p_param_0+6]; +; CHECK-NEXT: ld.param.b8 %r3, [test_s_i8f16x2p_param_0+7]; +; CHECK-NEXT: ld.param.b8 %r4, [test_s_i8f16x2p_param_0+8]; ; CHECK-NEXT: { // callseq 4, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[24]; -; CHECK-NEXT: st.param.b32 [param0], %r1; -; CHECK-NEXT: st.param.b8 [param0+4], %rs1; -; CHECK-NEXT: st.param.b8 [param0+5], %r2; -; CHECK-NEXT: st.param.b8 [param0+6], %r12; -; CHECK-NEXT: st.param.b8 [param0+7], %r13; -; CHECK-NEXT: st.param.b8 [param0+8], %r9; -; CHECK-NEXT: st.param.b64 [param0+16], %rd1; ; CHECK-NEXT: .param .align 8 .b8 retval0[24]; +; CHECK-NEXT: st.param.b8 [param0+8], %r4; +; CHECK-NEXT: st.param.b8 [param0+7], %r3; +; CHECK-NEXT: st.param.b8 [param0+6], %r2; +; CHECK-NEXT: st.param.b64 [param0+16], %rd1; +; CHECK-NEXT: st.param.b16 [param0+4], %rs1; +; CHECK-NEXT: st.param.b32 [param0], %r1; ; CHECK-NEXT: call.uni (retval0), test_s_i8f16x2p, (param0); -; CHECK-NEXT: ld.param.b32 %r14, [retval0]; -; CHECK-NEXT: ld.param.b8 %rs2, [retval0+4]; -; CHECK-NEXT: ld.param.b8 %rs3, [retval0+5]; -; CHECK-NEXT: ld.param.b8 %rs4, [retval0+6]; -; CHECK-NEXT: ld.param.b8 %rs5, [retval0+7]; -; CHECK-NEXT: ld.param.b8 %rs6, [retval0+8]; ; CHECK-NEXT: ld.param.b64 %rd2, [retval0+16]; +; CHECK-NEXT: ld.param.b8 %rs2, [retval0+4]; +; CHECK-NEXT: ld.param.b32 %r5, [retval0]; +; CHECK-NEXT: ld.param.b8 %r6, [retval0+8]; +; CHECK-NEXT: ld.param.b8 %r7, [retval0+7]; +; CHECK-NEXT: ld.param.b8 %r8, [retval0+6]; +; CHECK-NEXT: ld.param.b8 %r9, [retval0+5]; ; CHECK-NEXT: } // callseq 4 -; CHECK-NEXT: cvt.u32.u16 %r15, %rs3; -; CHECK-NEXT: cvt.u32.u16 %r16, %rs4; -; CHECK-NEXT: cvt.u32.u16 %r17, %rs5; -; CHECK-NEXT: cvt.u32.u16 %r18, %rs6; -; CHECK-NEXT: st.param.b32 [func_retval0], %r14; -; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2; -; CHECK-NEXT: st.param.b8 [func_retval0+8], %r18; -; CHECK-NEXT: st.param.b8 [func_retval0+7], %r17; -; CHECK-NEXT: st.param.b8 [func_retval0+6], %r16; -; CHECK-NEXT: st.param.b8 [func_retval0+5], %r15; +; CHECK-NEXT: shl.b32 %r12, %r8, 8; +; CHECK-NEXT: or.b32 %r13, %r12, %r9; +; CHECK-NEXT: shl.b32 %r15, %r7, 16; +; CHECK-NEXT: shl.b32 %r17, %r6, 24; +; CHECK-NEXT: or.b32 %r18, %r17, %r15; +; CHECK-NEXT: or.b32 %r19, %r18, %r13; +; CHECK-NEXT: st.param.b8 [func_retval0+5], %r9; ; CHECK-NEXT: st.param.b64 [func_retval0+16], %rd2; +; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NEXT: shr.u32 %r21, %r19, 24; +; CHECK-NEXT: st.param.b8 [func_retval0+8], %r21; +; CHECK-NEXT: shr.u32 %r22, %r19, 16; +; CHECK-NEXT: st.param.b8 [func_retval0+7], %r22; +; CHECK-NEXT: shr.u32 %r23, %r19, 8; +; CHECK-NEXT: st.param.b8 [func_retval0+6], %r23; ; CHECK-NEXT: ret; %r = tail call %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) ret %s_i8f16x2p %r @@ -337,56 +278,51 @@ define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) { define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) { ; CHECK-LABEL: test_s_i8f32p( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<12>; -; CHECK-NEXT: .reg .b32 %r<20>; +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg 
.b32 %r<24>; ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %r3, [test_s_i8f32p_param_0+6]; -; CHECK-NEXT: shl.b32 %r4, %r3, 8; -; CHECK-NEXT: ld.param.b8 %r5, [test_s_i8f32p_param_0+5]; -; CHECK-NEXT: or.b32 %r6, %r4, %r5; -; CHECK-NEXT: ld.param.b8 %r7, [test_s_i8f32p_param_0+7]; -; CHECK-NEXT: shl.b32 %r8, %r7, 16; -; CHECK-NEXT: ld.param.b8 %r9, [test_s_i8f32p_param_0+8]; -; CHECK-NEXT: shl.b32 %r10, %r9, 24; -; CHECK-NEXT: or.b32 %r11, %r10, %r8; -; CHECK-NEXT: or.b32 %r2, %r11, %r6; -; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8f32p_param_0+16]; -; CHECK-NEXT: ld.param.b8 %rs1, [test_s_i8f32p_param_0+4]; ; CHECK-NEXT: ld.param.b32 %r1, [test_s_i8f32p_param_0]; -; CHECK-NEXT: shr.u32 %r12, %r2, 8; -; CHECK-NEXT: shr.u32 %r13, %r11, 16; +; CHECK-NEXT: ld.param.b16 %rs1, [test_s_i8f32p_param_0+4]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8f32p_param_0+16]; +; CHECK-NEXT: ld.param.b8 %r2, [test_s_i8f32p_param_0+6]; +; CHECK-NEXT: ld.param.b8 %r3, [test_s_i8f32p_param_0+7]; +; CHECK-NEXT: ld.param.b8 %r4, [test_s_i8f32p_param_0+8]; ; CHECK-NEXT: { // callseq 5, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[24]; -; CHECK-NEXT: st.param.b32 [param0], %r1; -; CHECK-NEXT: st.param.b8 [param0+4], %rs1; -; CHECK-NEXT: st.param.b8 [param0+5], %r2; -; CHECK-NEXT: st.param.b8 [param0+6], %r12; -; CHECK-NEXT: st.param.b8 [param0+7], %r13; -; CHECK-NEXT: st.param.b8 [param0+8], %r9; -; CHECK-NEXT: st.param.b64 [param0+16], %rd1; ; CHECK-NEXT: .param .align 8 .b8 retval0[24]; +; CHECK-NEXT: st.param.b8 [param0+8], %r4; +; CHECK-NEXT: st.param.b8 [param0+7], %r3; +; CHECK-NEXT: st.param.b8 [param0+6], %r2; +; CHECK-NEXT: st.param.b64 [param0+16], %rd1; +; CHECK-NEXT: st.param.b16 [param0+4], %rs1; +; CHECK-NEXT: st.param.b32 [param0], %r1; ; CHECK-NEXT: call.uni (retval0), test_s_i8f32p, (param0); -; CHECK-NEXT: ld.param.b32 %r14, [retval0]; -; CHECK-NEXT: ld.param.b8 %rs2, [retval0+4]; -; CHECK-NEXT: ld.param.b8 %rs3, [retval0+5]; -; CHECK-NEXT: ld.param.b8 %rs4, [retval0+6]; -; CHECK-NEXT: ld.param.b8 %rs5, [retval0+7]; -; CHECK-NEXT: ld.param.b8 %rs6, [retval0+8]; ; CHECK-NEXT: ld.param.b64 %rd2, [retval0+16]; +; CHECK-NEXT: ld.param.b8 %rs2, [retval0+4]; +; CHECK-NEXT: ld.param.b32 %r5, [retval0]; +; CHECK-NEXT: ld.param.b8 %r6, [retval0+8]; +; CHECK-NEXT: ld.param.b8 %r7, [retval0+7]; +; CHECK-NEXT: ld.param.b8 %r8, [retval0+6]; +; CHECK-NEXT: ld.param.b8 %r9, [retval0+5]; ; CHECK-NEXT: } // callseq 5 -; CHECK-NEXT: cvt.u32.u16 %r15, %rs3; -; CHECK-NEXT: cvt.u32.u16 %r16, %rs4; -; CHECK-NEXT: cvt.u32.u16 %r17, %rs5; -; CHECK-NEXT: cvt.u32.u16 %r18, %rs6; -; CHECK-NEXT: st.param.b32 [func_retval0], %r14; -; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2; -; CHECK-NEXT: st.param.b8 [func_retval0+8], %r18; -; CHECK-NEXT: st.param.b8 [func_retval0+7], %r17; -; CHECK-NEXT: st.param.b8 [func_retval0+6], %r16; -; CHECK-NEXT: st.param.b8 [func_retval0+5], %r15; +; CHECK-NEXT: shl.b32 %r12, %r8, 8; +; CHECK-NEXT: or.b32 %r13, %r12, %r9; +; CHECK-NEXT: shl.b32 %r15, %r7, 16; +; CHECK-NEXT: shl.b32 %r17, %r6, 24; +; CHECK-NEXT: or.b32 %r18, %r17, %r15; +; CHECK-NEXT: or.b32 %r19, %r18, %r13; +; CHECK-NEXT: st.param.b8 [func_retval0+5], %r9; ; CHECK-NEXT: st.param.b64 [func_retval0+16], %rd2; +; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NEXT: shr.u32 %r21, %r19, 24; +; CHECK-NEXT: st.param.b8 [func_retval0+8], %r21; +; CHECK-NEXT: shr.u32 %r22, %r19, 16; +; CHECK-NEXT: st.param.b8 [func_retval0+7], 
%r22;
+; CHECK-NEXT: shr.u32 %r23, %r19, 8;
+; CHECK-NEXT: st.param.b8 [func_retval0+6], %r23;
 ; CHECK-NEXT: ret;
   %r = tail call %s_i8f32p @test_s_i8f32p(%s_i8f32p %a)
   ret %s_i8f32p %r
@@ -396,112 +332,66 @@ define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) {
 define %s_i8f64p @test_s_i8f64p(%s_i8f64p %a) {
 ; CHECK-LABEL: test_s_i8f64p(
 ; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<20>;
-; CHECK-NEXT: .reg .b64 %rd<68>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b64 %rd<46>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b8 %rd4, [test_s_i8f64p_param_0+10];
-; CHECK-NEXT: shl.b64 %rd5, %rd4, 8;
-; CHECK-NEXT: ld.param.b8 %rd6, [test_s_i8f64p_param_0+9];
-; CHECK-NEXT: or.b64 %rd7, %rd5, %rd6;
-; CHECK-NEXT: ld.param.b8 %rd8, [test_s_i8f64p_param_0+11];
-; CHECK-NEXT: shl.b64 %rd9, %rd8, 16;
-; CHECK-NEXT: ld.param.b8 %rd10, [test_s_i8f64p_param_0+12];
-; CHECK-NEXT: shl.b64 %rd11, %rd10, 24;
-; CHECK-NEXT: or.b64 %rd12, %rd11, %rd9;
-; CHECK-NEXT: or.b64 %rd13, %rd12, %rd7;
-; CHECK-NEXT: ld.param.b8 %rd14, [test_s_i8f64p_param_0+14];
-; CHECK-NEXT: shl.b64 %rd15, %rd14, 8;
-; CHECK-NEXT: ld.param.b8 %rd16, [test_s_i8f64p_param_0+13];
-; CHECK-NEXT: or.b64 %rd17, %rd15, %rd16;
-; CHECK-NEXT: ld.param.b8 %rd18, [test_s_i8f64p_param_0+15];
-; CHECK-NEXT: shl.b64 %rd19, %rd18, 16;
-; CHECK-NEXT: ld.param.b8 %rd20, [test_s_i8f64p_param_0+16];
-; CHECK-NEXT: shl.b64 %rd21, %rd20, 24;
-; CHECK-NEXT: or.b64 %rd22, %rd21, %rd19;
-; CHECK-NEXT: or.b64 %rd23, %rd22, %rd17;
-; CHECK-NEXT: shl.b64 %rd24, %rd23, 32;
-; CHECK-NEXT: or.b64 %rd2, %rd24, %rd13;
-; CHECK-NEXT: ld.param.b64 %rd3, [test_s_i8f64p_param_0+24];
-; CHECK-NEXT: ld.param.b8 %rs1, [test_s_i8f64p_param_0+8];
 ; CHECK-NEXT: ld.param.b64 %rd1, [test_s_i8f64p_param_0];
-; CHECK-NEXT: shr.u64 %rd25, %rd2, 8;
-; CHECK-NEXT: shr.u64 %rd26, %rd2, 16;
-; CHECK-NEXT: shr.u64 %rd27, %rd2, 24;
-; CHECK-NEXT: bfe.u64 %rd28, %rd23, 8, 24;
-; CHECK-NEXT: bfe.u64 %rd29, %rd23, 16, 16;
-; CHECK-NEXT: bfe.u64 %rd30, %rd23, 24, 8;
+; CHECK-NEXT: ld.param.b64 %rd2, [test_s_i8f64p_param_0+8];
+; CHECK-NEXT: ld.param.b64 %rd3, [test_s_i8f64p_param_0+24];
+; CHECK-NEXT: ld.param.b8 %rd4, [test_s_i8f64p_param_0+16];
 ; CHECK-NEXT: { // callseq 6, 0
 ; CHECK-NEXT: .param .align 8 .b8 param0[32];
-; CHECK-NEXT: st.param.b64 [param0], %rd1;
-; CHECK-NEXT: st.param.b8 [param0+8], %rs1;
-; CHECK-NEXT: st.param.b8 [param0+9], %rd2;
-; CHECK-NEXT: st.param.b8 [param0+10], %rd25;
-; CHECK-NEXT: st.param.b8 [param0+11], %rd26;
-; CHECK-NEXT: st.param.b8 [param0+12], %rd27;
-; CHECK-NEXT: st.param.b8 [param0+13], %rd23;
-; CHECK-NEXT: st.param.b8 [param0+14], %rd28;
-; CHECK-NEXT: st.param.b8 [param0+15], %rd29;
-; CHECK-NEXT: st.param.b8 [param0+16], %rd30;
-; CHECK-NEXT: st.param.b64 [param0+24], %rd3;
 ; CHECK-NEXT: .param .align 8 .b8 retval0[32];
+; CHECK-NEXT: st.param.b8 [param0+16], %rd4;
+; CHECK-NEXT: st.param.b64 [param0+24], %rd3;
+; CHECK-NEXT: st.param.b64 [param0+8], %rd2;
+; CHECK-NEXT: st.param.b64 [param0], %rd1;
 ; CHECK-NEXT: call.uni (retval0), test_s_i8f64p, (param0);
-; CHECK-NEXT: ld.param.b64 %rd31, [retval0];
-; CHECK-NEXT: ld.param.b8 %rs2, [retval0+8];
-; CHECK-NEXT: ld.param.b8 %rs3, [retval0+9];
-; CHECK-NEXT: ld.param.b8 %rs4, [retval0+10];
-; CHECK-NEXT: ld.param.b8 %rs5, [retval0+11];
-; CHECK-NEXT: ld.param.b8 %rs6, [retval0+12];
-; CHECK-NEXT: ld.param.b8 %rs7, [retval0+13];
-; CHECK-NEXT: ld.param.b8 %rs8, [retval0+14];
-; CHECK-NEXT: ld.param.b8 %rs9, [retval0+15];
-; CHECK-NEXT: ld.param.b8 %rs10, [retval0+16];
-; CHECK-NEXT: ld.param.b64 %rd32, [retval0+24];
+; CHECK-NEXT: ld.param.b64 %rd5, [retval0+24];
+; CHECK-NEXT: ld.param.b8 %rs1, [retval0+8];
+; CHECK-NEXT: ld.param.b64 %rd6, [retval0];
+; CHECK-NEXT: ld.param.b8 %rd7, [retval0+16];
+; CHECK-NEXT: ld.param.b8 %rd8, [retval0+15];
+; CHECK-NEXT: ld.param.b8 %rd9, [retval0+14];
+; CHECK-NEXT: ld.param.b8 %rd10, [retval0+13];
+; CHECK-NEXT: ld.param.b8 %rd11, [retval0+12];
+; CHECK-NEXT: ld.param.b8 %rd12, [retval0+11];
+; CHECK-NEXT: ld.param.b8 %rd13, [retval0+10];
+; CHECK-NEXT: ld.param.b8 %rd14, [retval0+9];
 ; CHECK-NEXT: } // callseq 6
-; CHECK-NEXT: cvt.u64.u16 %rd33, %rs3;
-; CHECK-NEXT: and.b64 %rd34, %rd33, 255;
-; CHECK-NEXT: cvt.u64.u16 %rd35, %rs4;
-; CHECK-NEXT: and.b64 %rd36, %rd35, 255;
-; CHECK-NEXT: shl.b64 %rd37, %rd36, 8;
-; CHECK-NEXT: or.b64 %rd38, %rd34, %rd37;
-; CHECK-NEXT: cvt.u64.u16 %rd39, %rs5;
-; CHECK-NEXT: and.b64 %rd40, %rd39, 255;
-; CHECK-NEXT: shl.b64 %rd41, %rd40, 16;
-; CHECK-NEXT: or.b64 %rd42, %rd38, %rd41;
-; CHECK-NEXT: cvt.u64.u16 %rd43, %rs6;
-; CHECK-NEXT: and.b64 %rd44, %rd43, 255;
-; CHECK-NEXT: shl.b64 %rd45, %rd44, 24;
-; CHECK-NEXT: or.b64 %rd46, %rd42, %rd45;
-; CHECK-NEXT: cvt.u64.u16 %rd47, %rs7;
-; CHECK-NEXT: and.b64 %rd48, %rd47, 255;
-; CHECK-NEXT: shl.b64 %rd49, %rd48, 32;
-; CHECK-NEXT: or.b64 %rd50, %rd46, %rd49;
-; CHECK-NEXT: cvt.u64.u16 %rd51, %rs8;
-; CHECK-NEXT: and.b64 %rd52, %rd51, 255;
-; CHECK-NEXT: shl.b64 %rd53, %rd52, 40;
-; CHECK-NEXT: or.b64 %rd54, %rd50, %rd53;
-; CHECK-NEXT: cvt.u64.u16 %rd55, %rs9;
-; CHECK-NEXT: and.b64 %rd56, %rd55, 255;
-; CHECK-NEXT: shl.b64 %rd57, %rd56, 48;
-; CHECK-NEXT: or.b64 %rd58, %rd54, %rd57;
-; CHECK-NEXT: cvt.u64.u16 %rd59, %rs10;
-; CHECK-NEXT: shl.b64 %rd60, %rd59, 56;
-; CHECK-NEXT: or.b64 %rd61, %rd58, %rd60;
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd31;
-; CHECK-NEXT: st.param.b8 [func_retval0+8], %rs2;
+; CHECK-NEXT: shl.b64 %rd17, %rd13, 8;
+; CHECK-NEXT: or.b64 %rd18, %rd17, %rd14;
+; CHECK-NEXT: shl.b64 %rd20, %rd12, 16;
+; CHECK-NEXT: shl.b64 %rd22, %rd11, 24;
+; CHECK-NEXT: or.b64 %rd23, %rd22, %rd20;
+; CHECK-NEXT: or.b64 %rd24, %rd23, %rd18;
+; CHECK-NEXT: shl.b64 %rd27, %rd9, 8;
+; CHECK-NEXT: or.b64 %rd28, %rd27, %rd10;
+; CHECK-NEXT: shl.b64 %rd30, %rd8, 16;
+; CHECK-NEXT: shl.b64 %rd32, %rd7, 24;
+; CHECK-NEXT: or.b64 %rd33, %rd32, %rd30;
+; CHECK-NEXT: or.b64 %rd34, %rd33, %rd28;
+; CHECK-NEXT: shl.b64 %rd35, %rd34, 32;
+; CHECK-NEXT: or.b64 %rd36, %rd35, %rd24;
+; CHECK-NEXT: st.param.b8 [func_retval0+9], %rd14;
+; CHECK-NEXT: st.param.b64 [func_retval0+24], %rd5;
+; CHECK-NEXT: st.param.b8 [func_retval0+8], %rs1;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd6;
+; CHECK-NEXT: shr.u64 %rd39, %rd36, 56;
+; CHECK-NEXT: st.param.b8 [func_retval0+16], %rd39;
+; CHECK-NEXT: shr.u64 %rd40, %rd36, 48;
+; CHECK-NEXT: st.param.b8 [func_retval0+15], %rd40;
+; CHECK-NEXT: shr.u64 %rd41, %rd36, 40;
+; CHECK-NEXT: st.param.b8 [func_retval0+14], %rd41;
+; CHECK-NEXT: shr.u64 %rd42, %rd36, 32;
+; CHECK-NEXT: st.param.b8 [func_retval0+13], %rd42;
+; CHECK-NEXT: shr.u64 %rd43, %rd36, 24;
 ; CHECK-NEXT: st.param.b8 [func_retval0+12], %rd43;
-; CHECK-NEXT: st.param.b8 [func_retval0+11], %rd39;
-; CHECK-NEXT: st.param.b8 [func_retval0+10], %rd35;
-; CHECK-NEXT: st.param.b8 [func_retval0+9], %rd33;
-; CHECK-NEXT: shr.u64 %rd64, %rd50, 32;
-; CHECK-NEXT: st.param.b8 [func_retval0+13], %rd64;
-; CHECK-NEXT: shr.u64 %rd65, %rd54, 40;
-; CHECK-NEXT: st.param.b8 [func_retval0+14], %rd65;
-; CHECK-NEXT: shr.u64 %rd66, %rd58, 48;
-; CHECK-NEXT: st.param.b8 [func_retval0+15], %rd66;
-; CHECK-NEXT: shr.u64 %rd67, %rd61, 56;
-; CHECK-NEXT: st.param.b8 [func_retval0+16], %rd67;
-; CHECK-NEXT: st.param.b64 [func_retval0+24], %rd32;
+; CHECK-NEXT: shr.u64 %rd44, %rd36, 16;
+; CHECK-NEXT: st.param.b8 [func_retval0+11], %rd44;
+; CHECK-NEXT: shr.u64 %rd45, %rd36, 8;
+; CHECK-NEXT: st.param.b8 [func_retval0+10], %rd45;
 ; CHECK-NEXT: ret;
   %r = tail call %s_i8f64p @test_s_i8f64p(%s_i8f64p %a)
   ret %s_i8f64p %r
diff --git a/llvm/test/CodeGen/NVPTX/vaargs.ll b/llvm/test/CodeGen/NVPTX/vaargs.ll
index 3ca729f..9e312a2 100644
--- a/llvm/test/CodeGen/NVPTX/vaargs.ll
+++ b/llvm/test/CodeGen/NVPTX/vaargs.ll
@@ -89,14 +89,14 @@ define i32 @test_foo(i32 %i, i64 %l, double %d, ptr %p) {
 ; CHECK-NEXT: ld.param.b32 [[ARG_I32:%r[0-9]+]], [test_foo_param_0];
 ; Store arguments to an array
-; CHECK32: .param .align 8 .b8 param1[28];
-; CHECK64: .param .align 8 .b8 param1[32];
-; CHECK-NEXT: st.param.b32 [param1], [[ARG_I32]];
-; CHECK-NEXT: st.param.b64 [param1+8], [[ARG_I64]];
-; CHECK-NEXT: st.param.b64 [param1+16], [[ARG_DOUBLE]];
-; CHECK-NEXT: st.param.b[[BITS]] [param1+24], [[ARG_VOID_PTR]];
-; CHECK-NEXT: .param .b32 retval0;
-; CHECK-NEXT: prototype_1 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .align 8 .b8 _[]
+; CHECK32: .param .align 8 .b8 param1[28];
+; CHECK64: .param .align 8 .b8 param1[32];
+; CHECK-DAG: .param .b32 retval0;
+; CHECK-DAG: st.param.b32 [param1], [[ARG_I32]];
+; CHECK-DAG: st.param.b64 [param1+8], [[ARG_I64]];
+; CHECK-DAG: st.param.b64 [param1+16], [[ARG_DOUBLE]];
+; CHECK-DAG: st.param.b[[BITS]] [param1+24], [[ARG_VOID_PTR]];
+; CHECK-DAG: prototype_1 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .align 8 .b8 _[]
 entry:
   %ptr = load ptr, ptr addrspacecast (ptr addrspace(1) @foo_ptr to ptr), align 8
diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index ad2e704..a9b3675 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -115,13 +115,13 @@ define dso_local i32 @foo() {
 ; CHECK-PTX-NEXT: st.b64 [%SP+16], 1;
 ; CHECK-PTX-NEXT: st.b64 [%SP+24], 4607182418800017408;
 ; CHECK-PTX-NEXT: st.b64 [%SP+32], 4607182418800017408;
-; CHECK-PTX-NEXT: add.u64 %rd1, %SP, 0;
 ; CHECK-PTX-NEXT: { // callseq 0, 0
 ; CHECK-PTX-NEXT: .param .b32 param0;
-; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
 ; CHECK-PTX-NEXT: .param .b64 param1;
-; CHECK-PTX-NEXT: st.param.b64 [param1], %rd1;
 ; CHECK-PTX-NEXT: .param .b32 retval0;
+; CHECK-PTX-NEXT: add.u64 %rd1, %SP, 0;
+; CHECK-PTX-NEXT: st.param.b64 [param1], %rd1;
+; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
 ; CHECK-PTX-NEXT: call.uni (retval0), variadics1, (param0, param1);
 ; CHECK-PTX-NEXT: ld.param.b32 %r1, [retval0];
 ; CHECK-PTX-NEXT: } // callseq 0
@@ -218,13 +218,13 @@ define dso_local i32 @bar() {
 ; CHECK-PTX-NEXT: st.b32 [%SP+8], 1;
 ; CHECK-PTX-NEXT: st.b8 [%SP+12], 1;
 ; CHECK-PTX-NEXT: st.b64 [%SP+16], 1;
-; CHECK-PTX-NEXT: add.u64 %rd3, %SP, 8;
 ; CHECK-PTX-NEXT: { // callseq 1, 0
 ; CHECK-PTX-NEXT: .param .b32 param0;
-; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
 ; CHECK-PTX-NEXT: .param .b64 param1;
-; CHECK-PTX-NEXT: st.param.b64 [param1], %rd3;
 ; CHECK-PTX-NEXT: .param .b32 retval0;
+; CHECK-PTX-NEXT: add.u64 %rd3, %SP, 8;
+; CHECK-PTX-NEXT: st.param.b64 [param1], %rd3;
+; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
 ; CHECK-PTX-NEXT: call.uni (retval0), variadics2, (param0, param1);
 ; CHECK-PTX-NEXT: ld.param.b32 %r1, [retval0];
 ; CHECK-PTX-NEXT: } // callseq 1
@@ -289,13 +289,13 @@ define dso_local i32 @baz() {
 ; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot5;
 ; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
 ; CHECK-PTX-NEXT: st.v4.b32 [%SP], {1, 1, 1, 1};
-; CHECK-PTX-NEXT: add.u64 %rd1, %SP, 0;
 ; CHECK-PTX-NEXT: { // callseq 2, 0
 ; CHECK-PTX-NEXT: .param .b32 param0;
-; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
 ; CHECK-PTX-NEXT: .param .b64 param1;
-; CHECK-PTX-NEXT: st.param.b64 [param1], %rd1;
 ; CHECK-PTX-NEXT: .param .b32 retval0;
+; CHECK-PTX-NEXT: add.u64 %rd1, %SP, 0;
+; CHECK-PTX-NEXT: st.param.b64 [param1], %rd1;
+; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
 ; CHECK-PTX-NEXT: call.uni (retval0), variadics3, (param0, param1);
 ; CHECK-PTX-NEXT: ld.param.b32 %r1, [retval0];
 ; CHECK-PTX-NEXT: } // callseq 2
@@ -348,7 +348,6 @@ define dso_local void @qux() {
 ; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot7[24];
 ; CHECK-PTX-NEXT: .reg .b64 %SP;
 ; CHECK-PTX-NEXT: .reg .b64 %SPL;
-; CHECK-PTX-NEXT: .reg .b32 %r<2>;
 ; CHECK-PTX-NEXT: .reg .b64 %rd<8>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT: // %bb.0: // %entry
@@ -360,18 +359,17 @@ define dso_local void @qux() {
 ; CHECK-PTX-NEXT: ld.global.nc.b64 %rd4, [__const_$_qux_$_s];
 ; CHECK-PTX-NEXT: st.local.b64 [%rd2], %rd4;
 ; CHECK-PTX-NEXT: st.b64 [%SP+16], 1;
-; CHECK-PTX-NEXT: ld.local.b64 %rd5, [%rd2];
-; CHECK-PTX-NEXT: ld.local.b64 %rd6, [%rd2+8];
-; CHECK-PTX-NEXT: add.u64 %rd7, %SP, 16;
 ; CHECK-PTX-NEXT: { // callseq 3, 0
 ; CHECK-PTX-NEXT: .param .align 8 .b8 param0[16];
-; CHECK-PTX-NEXT: st.param.b64 [param0], %rd5;
-; CHECK-PTX-NEXT: st.param.b64 [param0+8], %rd6;
 ; CHECK-PTX-NEXT: .param .b64 param1;
-; CHECK-PTX-NEXT: st.param.b64 [param1], %rd7;
 ; CHECK-PTX-NEXT: .param .b32 retval0;
+; CHECK-PTX-NEXT: add.u64 %rd5, %SP, 16;
+; CHECK-PTX-NEXT: st.param.b64 [param1], %rd5;
+; CHECK-PTX-NEXT: ld.local.b64 %rd6, [%rd2+8];
+; CHECK-PTX-NEXT: st.param.b64 [param0+8], %rd6;
+; CHECK-PTX-NEXT: ld.local.b64 %rd7, [%rd2];
+; CHECK-PTX-NEXT: st.param.b64 [param0], %rd7;
 ; CHECK-PTX-NEXT: call.uni (retval0), variadics4, (param0, param1);
-; CHECK-PTX-NEXT: ld.param.b32 %r1, [retval0];
 ; CHECK-PTX-NEXT: } // callseq 3
 ; CHECK-PTX-NEXT: ret;
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vlseg-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vlseg-rv32.ll
index e6a98c9..eb3422d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vlseg-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vlseg-rv32.ll
@@ -2,4246 +2,3303 @@
 ; RUN: llc -mtriple=riscv32 -mattr=+zve64d,+f,+d,+zvfh,+zvfbfmin \
 ; RUN: -verify-machineinstrs < %s | FileCheck %s
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv1i8_2t(target("riscv.vector.tuple", <vscale x 1 x i8>, 2), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv1i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 2), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i8> @test_vlseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 2) @test_vlseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t(ptr %base, i32 %vl) {
 ; CHECK-LABEL: test_vlseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t:
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0)
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
 ; CHECK-NEXT: ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 1
x i8>, 2) undef, ptr %base, i32 %vl, i32 3)
-  %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_2t(target("riscv.vector.tuple", <vscale x 1 x i8>, 2) %0, i32 1)
-  ret <vscale x 1 x i8> %1
+  ret target("riscv.vector.tuple", <vscale x 1 x i8>, 2) %0
 }
-
-define <vscale x 1 x i8> @test_vlseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 2) @test_vlseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
 ; CHECK-LABEL: test_vlseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2t:
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e8.v v8, (a0), v0.t
 ; CHECK-NEXT: ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv1i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 3)
-  %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_2t(target("riscv.vector.tuple", <vscale x 1 x i8>, 2) %0, i32 1)
-  ret <vscale x 1 x i8> %1
+  ret target("riscv.vector.tuple", <vscale x 1 x i8>, 2) %0
 }
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 2), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i8> @test_vlseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl) {
 ; CHECK-LABEL: test_vlseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t:
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0)
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
 ; CHECK-NEXT: ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, i32 %vl, i32 3)
-  %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1)
-  ret <vscale x 2 x i8> %1
+  ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0
 }
-
-define <vscale x 2 x i8> @test_vlseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: test_vlseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2t:
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e8.v v8, (a0), v0.t
 ; CHECK-NEXT: ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 3)
-  %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1)
-  ret <vscale x 2 x i8> %1
+  ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0
 }
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2), ptr, <vscale x 4 x i1>, i32, i32, i32)
-
-define <vscale x 4 x i8> @test_vlseg2_nxv4i8_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_nxv4i8_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl) {
 ; CHECK-LABEL: test_vlseg2_nxv4i8_triscv.vector.tuple_nxv4i8_2t:
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0)
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
 ; CHECK-NEXT: ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, i32 %vl, i32 3)
-  %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
-  ret <vscale x 4 x i8> %1
+  ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
 }
-
-define <vscale x 4 x i8> @test_vlseg2_mask_nxv4i8_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_mask_nxv4i8_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
 ; CHECK-LABEL: test_vlseg2_mask_nxv4i8_triscv.vector.tuple_nxv4i8_2t:
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e8.v v8, (a0), v0.t
 ; CHECK-NEXT: ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 3)
-  %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
-  ret <vscale x 4 x i8> %1
+  ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
 }
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2), ptr, i32, i32)
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2), ptr, <vscale x 8 x i1>, i32, i32, i32)
-
-define <vscale x 8 x i8> @test_vlseg2_nxv8i8_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv8i8_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) {
 ; CHECK-LABEL: test_vlseg2_nxv8i8_triscv.vector.tuple_nxv8i8_2t:
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0)
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
 ; CHECK-NEXT: ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i32 %vl, i32 3)
-  %1 = call <vscale x 8 x i8>
@llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1) - ret <vscale x 8 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0 } - -define <vscale x 8 x i8> @test_vlseg2_mask_nxv8i8_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv8i8_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv8i8_triscv.vector.tuple_nxv8i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vlseg2e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg2e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1) - ret <vscale x 8 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0 } - -declare target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2), ptr, i32, i32) -declare target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv16i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2), ptr, <vscale x 16 x i1>, i32, i32, i32) - -define <vscale x 16 x i8> @test_vlseg2_nxv16i8_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv16i8_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg2_nxv16i8_triscv.vector.tuple_nxv16i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vlseg2e8.v v6, (a0) +; CHECK-NEXT: vlseg2e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i32 %vl, i32 3) - %1 = call <vscale x 16 x i8> @llvm.riscv.tuple.extract.nxv16i8.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1) - ret <vscale x 16 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0 } - -define <vscale x 16 x i8> @test_vlseg2_mask_nxv16i8_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 16 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv16i8_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 16 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv16i8_triscv.vector.tuple_nxv16i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vlseg2e8.v v6, (a0), v0.t +; CHECK-NEXT: vlseg2e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv16i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 16 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 16 x i8> @llvm.riscv.tuple.extract.nxv16i8.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1) - ret <vscale x 16 x i8> 
%1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0 } - -declare target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2), ptr, i32, i32) -declare target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv32i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2), ptr, <vscale x 32 x i1>, i32, i32, i32) - -define <vscale x 32 x i8> @test_vlseg2_nxv32i8_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv32i8_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg2_nxv32i8_triscv.vector.tuple_nxv32i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-NEXT: vlseg2e8.v v4, (a0) +; CHECK-NEXT: vlseg2e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i32 %vl, i32 3) - %1 = call <vscale x 32 x i8> @llvm.riscv.tuple.extract.nxv32i8.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1) - ret <vscale x 32 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0 } - -define <vscale x 32 x i8> @test_vlseg2_mask_nxv32i8_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 32 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv32i8_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 32 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv32i8_triscv.vector.tuple_nxv32i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-NEXT: vlseg2e8.v v4, (a0), v0.t +; CHECK-NEXT: vlseg2e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv32i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 32 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 32 x i8> @llvm.riscv.tuple.extract.nxv32i8.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1) - ret <vscale x 32 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0 } - -declare target("riscv.vector.tuple", <vscale x 1 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv1i8_3t(target("riscv.vector.tuple", <vscale x 1 x i8>, 3), ptr, i32, i32) -declare target("riscv.vector.tuple", <vscale x 1 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv1i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 3), ptr, <vscale x 1 x i1>, i32, i32, i32) - -define <vscale x 1 x i8> @test_vlseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 1 x i8>, 3) @test_vlseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vlseg3e8.v v7, (a0) +; CHECK-NEXT: vlseg3e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv1i8_3t(target("riscv.vector.tuple", <vscale x 1 x i8>, 3) undef, ptr %base, i32 %vl, i32 3) - %1 = call <vscale x 1 x i8> 
@llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_3t(target("riscv.vector.tuple", <vscale x 1 x i8>, 3) %0, i32 1) - ret <vscale x 1 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 1 x i8>, 3) %0 } - -define <vscale x 1 x i8> @test_vlseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 1 x i8>, 3) @test_vlseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vlseg3e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg3e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv1i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_3t(target("riscv.vector.tuple", <vscale x 1 x i8>, 3) %0, i32 1) - ret <vscale x 1 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 1 x i8>, 3) %0 } - -declare target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3), ptr, i32, i32) -declare target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 3), ptr, <vscale x 2 x i1>, i32, i32, i32) - -define <vscale x 2 x i8> @test_vlseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vlseg3e8.v v7, (a0) +; CHECK-NEXT: vlseg3e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, i32 %vl, i32 3) - %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1) - ret <vscale x 2 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0 } - -define <vscale x 2 x i8> @test_vlseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vlseg3e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg3e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1) - ret <vscale x 2 x i8> %1 + ret target("riscv.vector.tuple", 
<vscale x 2 x i8>, 3) %0 } - -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3), ptr, i32, i32) -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3), ptr, <vscale x 4 x i1>, i32, i32, i32) - -define <vscale x 4 x i8> @test_vlseg3_nxv4i8_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_nxv4i8_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg3_nxv4i8_triscv.vector.tuple_nxv4i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vlseg3e8.v v7, (a0) +; CHECK-NEXT: vlseg3e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, i32 %vl, i32 3) - %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1) - ret <vscale x 4 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0 } - -define <vscale x 4 x i8> @test_vlseg3_mask_nxv4i8_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_mask_nxv4i8_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv4i8_triscv.vector.tuple_nxv4i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vlseg3e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg3e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1) - ret <vscale x 4 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0 } - -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3), ptr, i32, i32) -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3), ptr, <vscale x 8 x i1>, i32, i32, i32) - -define <vscale x 8 x i8> @test_vlseg3_nxv8i8_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv8i8_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg3_nxv8i8_triscv.vector.tuple_nxv8i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vlseg3e8.v v7, (a0) +; CHECK-NEXT: vlseg3e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i32 %vl, i32 3) - %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 
3) %0, i32 1) - ret <vscale x 8 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0 } - -define <vscale x 8 x i8> @test_vlseg3_mask_nxv8i8_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv8i8_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv8i8_triscv.vector.tuple_nxv8i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vlseg3e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg3e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1) - ret <vscale x 8 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0 } - -declare target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3), ptr, i32, i32) -declare target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv16i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3), ptr, <vscale x 16 x i1>, i32, i32, i32) - -define <vscale x 16 x i8> @test_vlseg3_nxv16i8_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv16i8_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg3_nxv16i8_triscv.vector.tuple_nxv16i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vlseg3e8.v v6, (a0) +; CHECK-NEXT: vlseg3e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i32 %vl, i32 3) - %1 = call <vscale x 16 x i8> @llvm.riscv.tuple.extract.nxv16i8.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1) - ret <vscale x 16 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0 } - -define <vscale x 16 x i8> @test_vlseg3_mask_nxv16i8_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 16 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv16i8_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 16 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv16i8_triscv.vector.tuple_nxv16i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vlseg3e8.v v6, (a0), v0.t +; CHECK-NEXT: vlseg3e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv16i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 16 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 16 x i8> @llvm.riscv.tuple.extract.nxv16i8.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1) - ret <vscale x 16 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0 } - -declare target("riscv.vector.tuple", 
<vscale x 1 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv1i8_4t(target("riscv.vector.tuple", <vscale x 1 x i8>, 4), ptr, i32, i32) -declare target("riscv.vector.tuple", <vscale x 1 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv1i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 4), ptr, <vscale x 1 x i1>, i32, i32, i32) - -define <vscale x 1 x i8> @test_vlseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 1 x i8>, 4) @test_vlseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vlseg4e8.v v7, (a0) +; CHECK-NEXT: vlseg4e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv1i8_4t(target("riscv.vector.tuple", <vscale x 1 x i8>, 4) undef, ptr %base, i32 %vl, i32 3) - %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_4t(target("riscv.vector.tuple", <vscale x 1 x i8>, 4) %0, i32 1) - ret <vscale x 1 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 1 x i8>, 4) %0 } - -define <vscale x 1 x i8> @test_vlseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 1 x i8>, 4) @test_vlseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vlseg4e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg4e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv1i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_4t(target("riscv.vector.tuple", <vscale x 1 x i8>, 4) %0, i32 1) - ret <vscale x 1 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 1 x i8>, 4) %0 } - -declare target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4), ptr, i32, i32) -declare target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 4), ptr, <vscale x 2 x i1>, i32, i32, i32) - -define <vscale x 2 x i8> @test_vlseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vlseg4e8.v v7, (a0) +; CHECK-NEXT: vlseg4e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, i32 %vl, i32 3) - %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1) - ret <vscale x 2 x i8> %1 + ret 
target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0 } - -define <vscale x 2 x i8> @test_vlseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vlseg4e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg4e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1) - ret <vscale x 2 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0 } - -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4), ptr, i32, i32) -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4), ptr, <vscale x 4 x i1>, i32, i32, i32) - -define <vscale x 4 x i8> @test_vlseg4_nxv4i8_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_nxv4i8_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg4_nxv4i8_triscv.vector.tuple_nxv4i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vlseg4e8.v v7, (a0) +; CHECK-NEXT: vlseg4e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, i32 %vl, i32 3) - %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1) - ret <vscale x 4 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0 } - -define <vscale x 4 x i8> @test_vlseg4_mask_nxv4i8_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_mask_nxv4i8_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv4i8_triscv.vector.tuple_nxv4i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vlseg4e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg4e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1) - ret <vscale x 4 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0 } - -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 4) 
@llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4), ptr, i32, i32) -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4), ptr, <vscale x 8 x i1>, i32, i32, i32) - -define <vscale x 8 x i8> @test_vlseg4_nxv8i8_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv8i8_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg4_nxv8i8_triscv.vector.tuple_nxv8i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vlseg4e8.v v7, (a0) +; CHECK-NEXT: vlseg4e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i32 %vl, i32 3) - %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1) - ret <vscale x 8 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0 } - -define <vscale x 8 x i8> @test_vlseg4_mask_nxv8i8_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv8i8_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv8i8_triscv.vector.tuple_nxv8i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vlseg4e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg4e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1) - ret <vscale x 8 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0 } - -declare target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4), ptr, i32, i32) -declare target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv16i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4), ptr, <vscale x 16 x i1>, i32, i32, i32) - -define <vscale x 16 x i8> @test_vlseg4_nxv16i8_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv16i8_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg4_nxv16i8_triscv.vector.tuple_nxv16i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vlseg4e8.v v6, (a0) +; CHECK-NEXT: vlseg4e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i32 %vl, i32 3) - %1 = call <vscale x 16 x i8> @llvm.riscv.tuple.extract.nxv16i8.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1) - ret <vscale x 16 x i8> %1 + ret 
target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0 } - -define <vscale x 16 x i8> @test_vlseg4_mask_nxv16i8_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 16 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv16i8_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 16 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv16i8_triscv.vector.tuple_nxv16i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vlseg4e8.v v6, (a0), v0.t +; CHECK-NEXT: vlseg4e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv16i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 16 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 16 x i8> @llvm.riscv.tuple.extract.nxv16i8.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1) - ret <vscale x 16 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0 } - -declare target("riscv.vector.tuple", <vscale x 1 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv1i8_5t(target("riscv.vector.tuple", <vscale x 1 x i8>, 5), ptr, i32, i32) -declare target("riscv.vector.tuple", <vscale x 1 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv1i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 5), ptr, <vscale x 1 x i1>, i32, i32, i32) - -define <vscale x 1 x i8> @test_vlseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 1 x i8>, 5) @test_vlseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vlseg5e8.v v7, (a0) +; CHECK-NEXT: vlseg5e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv1i8_5t(target("riscv.vector.tuple", <vscale x 1 x i8>, 5) undef, ptr %base, i32 %vl, i32 3) - %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_5t(target("riscv.vector.tuple", <vscale x 1 x i8>, 5) %0, i32 1) - ret <vscale x 1 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 1 x i8>, 5) %0 } - -define <vscale x 1 x i8> @test_vlseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 1 x i8>, 5) @test_vlseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vlseg5e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg5e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv1i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_5t(target("riscv.vector.tuple", <vscale x 1 x i8>, 5) %0, i32 1) - ret <vscale x 1 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 1 x i8>, 5) %0 } - -declare target("riscv.vector.tuple", <vscale x 2 x i8>, 5) 
@llvm.riscv.vlseg5.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5), ptr, i32, i32) -declare target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 5), ptr, <vscale x 2 x i1>, i32, i32, i32) - -define <vscale x 2 x i8> @test_vlseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vlseg5e8.v v7, (a0) +; CHECK-NEXT: vlseg5e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, i32 %vl, i32 3) - %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1) - ret <vscale x 2 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0 } - -define <vscale x 2 x i8> @test_vlseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vlseg5e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg5e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1) - ret <vscale x 2 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0 } - -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5), ptr, i32, i32) -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5), ptr, <vscale x 4 x i1>, i32, i32, i32) - -define <vscale x 4 x i8> @test_vlseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vlseg5e8.v v7, (a0) +; CHECK-NEXT: vlseg5e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, i32 %vl, i32 3) - %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1) - ret <vscale x 4 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 4 
x i8>, 5) %0 } - -define <vscale x 4 x i8> @test_vlseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vlseg5e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg5e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1) - ret <vscale x 4 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0 } - -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5), ptr, i32, i32) -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5), ptr, <vscale x 8 x i1>, i32, i32, i32) - -define <vscale x 8 x i8> @test_vlseg5_nxv8i8_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv8i8_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg5_nxv8i8_triscv.vector.tuple_nxv8i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vlseg5e8.v v7, (a0) +; CHECK-NEXT: vlseg5e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i32 %vl, i32 3) - %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1) - ret <vscale x 8 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0 } - -define <vscale x 8 x i8> @test_vlseg5_mask_nxv8i8_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv8i8_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) { ; CHECK-LABEL: test_vlseg5_mask_nxv8i8_triscv.vector.tuple_nxv8i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vlseg5e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg5e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1) - ret <vscale x 8 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0 } - -declare target("riscv.vector.tuple", <vscale x 1 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv1i8_6t(target("riscv.vector.tuple", <vscale x 1 x i8>, 6), ptr, i32, i32) 
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv1i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 6), ptr, <vscale x 1 x i1>, i32, i32, i32) - -define <vscale x 1 x i8> @test_vlseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 1 x i8>, 6) @test_vlseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vlseg6e8.v v7, (a0) +; CHECK-NEXT: vlseg6e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv1i8_6t(target("riscv.vector.tuple", <vscale x 1 x i8>, 6) undef, ptr %base, i32 %vl, i32 3) - %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_6t(target("riscv.vector.tuple", <vscale x 1 x i8>, 6) %0, i32 1) - ret <vscale x 1 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 1 x i8>, 6) %0 } - -define <vscale x 1 x i8> @test_vlseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 1 x i8>, 6) @test_vlseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vlseg6e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg6e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv1i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_6t(target("riscv.vector.tuple", <vscale x 1 x i8>, 6) %0, i32 1) - ret <vscale x 1 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 1 x i8>, 6) %0 } - -declare target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6), ptr, i32, i32) -declare target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 6), ptr, <vscale x 2 x i1>, i32, i32, i32) - -define <vscale x 2 x i8> @test_vlseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vlseg6e8.v v7, (a0) +; CHECK-NEXT: vlseg6e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, i32 %vl, i32 3) - %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1) - ret <vscale x 2 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0 } - -define <vscale x 2 x i8> @test_vlseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 
%vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vlseg6e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg6e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1) - ret <vscale x 2 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0 } - -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6), ptr, i32, i32) -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6), ptr, <vscale x 4 x i1>, i32, i32, i32) - -define <vscale x 4 x i8> @test_vlseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vlseg6e8.v v7, (a0) +; CHECK-NEXT: vlseg6e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, i32 %vl, i32 3) - %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1) - ret <vscale x 4 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0 } - -define <vscale x 4 x i8> @test_vlseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vlseg6e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg6e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1) - ret <vscale x 4 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0 } - -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6), ptr, i32, i32) -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 6) 
@llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6), ptr, <vscale x 8 x i1>, i32, i32, i32) - -define <vscale x 8 x i8> @test_vlseg6_nxv8i8_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv8i8_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg6_nxv8i8_triscv.vector.tuple_nxv8i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vlseg6e8.v v7, (a0) +; CHECK-NEXT: vlseg6e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i32 %vl, i32 3) - %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1) - ret <vscale x 8 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0 } - -define <vscale x 8 x i8> @test_vlseg6_mask_nxv8i8_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv8i8_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) { ; CHECK-LABEL: test_vlseg6_mask_nxv8i8_triscv.vector.tuple_nxv8i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vlseg6e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg6e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1) - ret <vscale x 8 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0 } - -declare target("riscv.vector.tuple", <vscale x 1 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv1i8_7t(target("riscv.vector.tuple", <vscale x 1 x i8>, 7), ptr, i32, i32) -declare target("riscv.vector.tuple", <vscale x 1 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv1i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 7), ptr, <vscale x 1 x i1>, i32, i32, i32) - -define <vscale x 1 x i8> @test_vlseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 1 x i8>, 7) @test_vlseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vlseg7e8.v v7, (a0) +; CHECK-NEXT: vlseg7e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv1i8_7t(target("riscv.vector.tuple", <vscale x 1 x i8>, 7) undef, ptr %base, i32 %vl, i32 3) - %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_7t(target("riscv.vector.tuple", <vscale x 1 x i8>, 7) %0, i32 1) - ret <vscale x 1 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 1 x i8>, 7) %0 } - -define <vscale x 1 x i8> @test_vlseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define 
target("riscv.vector.tuple", <vscale x 1 x i8>, 7) @test_vlseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vlseg7e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg7e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv1i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_7t(target("riscv.vector.tuple", <vscale x 1 x i8>, 7) %0, i32 1) - ret <vscale x 1 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 1 x i8>, 7) %0 } - -declare target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7), ptr, i32, i32) -declare target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 7), ptr, <vscale x 2 x i1>, i32, i32, i32) - -define <vscale x 2 x i8> @test_vlseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vlseg7e8.v v7, (a0) +; CHECK-NEXT: vlseg7e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, i32 %vl, i32 3) - %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1) - ret <vscale x 2 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0 } - -define <vscale x 2 x i8> @test_vlseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vlseg7e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg7e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1) - ret <vscale x 2 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0 } - -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7), ptr, i32, i32) -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv4i1(target("riscv.vector.tuple", 
<vscale x 4 x i8>, 7), ptr, <vscale x 4 x i1>, i32, i32, i32) - -define <vscale x 4 x i8> @test_vlseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vlseg7e8.v v7, (a0) +; CHECK-NEXT: vlseg7e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, i32 %vl, i32 3) - %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1) - ret <vscale x 4 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0 } - -define <vscale x 4 x i8> @test_vlseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vlseg7e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg7e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1) - ret <vscale x 4 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0 } - -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7), ptr, i32, i32) -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7), ptr, <vscale x 8 x i1>, i32, i32, i32) - -define <vscale x 8 x i8> @test_vlseg7_nxv8i8_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv8i8_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg7_nxv8i8_triscv.vector.tuple_nxv8i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vlseg7e8.v v7, (a0) +; CHECK-NEXT: vlseg7e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i32 %vl, i32 3) - %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1) - ret <vscale x 8 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0 } - -define <vscale x 8 x i8> @test_vlseg7_mask_nxv8i8_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv8i8_triscv.vector.tuple_nxv8i8_7t(ptr 
%base, i32 %vl, <vscale x 8 x i1> %mask) { ; CHECK-LABEL: test_vlseg7_mask_nxv8i8_triscv.vector.tuple_nxv8i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vlseg7e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg7e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1) - ret <vscale x 8 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0 } - -declare target("riscv.vector.tuple", <vscale x 1 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv1i8_8t(target("riscv.vector.tuple", <vscale x 1 x i8>, 8), ptr, i32, i32) -declare target("riscv.vector.tuple", <vscale x 1 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv1i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 8), ptr, <vscale x 1 x i1>, i32, i32, i32) - -define <vscale x 1 x i8> @test_vlseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 1 x i8>, 8) @test_vlseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vlseg8e8.v v7, (a0) +; CHECK-NEXT: vlseg8e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv1i8_8t(target("riscv.vector.tuple", <vscale x 1 x i8>, 8) undef, ptr %base, i32 %vl, i32 3) - %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_8t(target("riscv.vector.tuple", <vscale x 1 x i8>, 8) %0, i32 1) - ret <vscale x 1 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 1 x i8>, 8) %0 } - -define <vscale x 1 x i8> @test_vlseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 1 x i8>, 8) @test_vlseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vlseg8e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg8e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv1i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_8t(target("riscv.vector.tuple", <vscale x 1 x i8>, 8) %0, i32 1) - ret <vscale x 1 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 1 x i8>, 8) %0 } - -declare target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8), ptr, i32, i32) -declare target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 8), ptr, <vscale x 2 x i1>, i32, i32, i32) - -define <vscale x 2 x i8> 
@test_vlseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vlseg8e8.v v7, (a0) +; CHECK-NEXT: vlseg8e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, i32 %vl, i32 3) - %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1) - ret <vscale x 2 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0 } - -define <vscale x 2 x i8> @test_vlseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vlseg8e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg8e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1) - ret <vscale x 2 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0 } - -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8), ptr, i32, i32) -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8), ptr, <vscale x 4 x i1>, i32, i32, i32) - -define <vscale x 4 x i8> @test_vlseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vlseg8e8.v v7, (a0) +; CHECK-NEXT: vlseg8e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, i32 %vl, i32 3) - %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1) - ret <vscale x 4 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0 } - -define <vscale x 4 x i8> @test_vlseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: 
test_vlseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vlseg8e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg8e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1) - ret <vscale x 4 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0 } - -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8), ptr, i32, i32) -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8), ptr, <vscale x 8 x i1>, i32, i32, i32) - -define <vscale x 8 x i8> @test_vlseg8_nxv8i8_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv8i8_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg8_nxv8i8_triscv.vector.tuple_nxv8i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vlseg8e8.v v7, (a0) +; CHECK-NEXT: vlseg8e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i32 %vl, i32 3) - %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1) - ret <vscale x 8 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0 } - -define <vscale x 8 x i8> @test_vlseg8_mask_nxv8i8_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv8i8_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) { ; CHECK-LABEL: test_vlseg8_mask_nxv8i8_triscv.vector.tuple_nxv8i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vlseg8e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg8e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 3) - %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1) - ret <vscale x 8 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0 } - -declare target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 2), ptr, <vscale x 1 x i1>, i32, i32, i32) - -define <vscale x 1 x i16> @test_vlseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl) { ; CHECK-LABEL: 
test_vlseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg2e16.v v7, (a0) +; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1) - ret <vscale x 1 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0 } - -define <vscale x 1 x i16> @test_vlseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1) - ret <vscale x 1 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0 } - -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2), ptr, <vscale x 2 x i1>, i32, i32, i32) - -define <vscale x 2 x i16> @test_vlseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg2e16.v v7, (a0) +; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1) - ret <vscale x 2 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0 } - -define <vscale x 2 x i16> @test_vlseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1(target("riscv.vector.tuple", 
<vscale x 4 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1) - ret <vscale x 2 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0 } - -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2), ptr, <vscale x 4 x i1>, i32, i32, i32) - -define <vscale x 4 x i16> @test_vlseg2_nxv4i16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv4i16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg2_nxv4i16_triscv.vector.tuple_nxv8i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg2e16.v v7, (a0) +; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1) - ret <vscale x 4 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0 } - -define <vscale x 4 x i16> @test_vlseg2_mask_nxv4i16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv4i16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv4i16_triscv.vector.tuple_nxv8i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1) - ret <vscale x 4 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0 } - -declare target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2), ptr, <vscale x 8 x i1>, i32, i32, i32) - -define <vscale x 8 x i16> @test_vlseg2_nxv8i16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv8i16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg2_nxv8i16_triscv.vector.tuple_nxv16i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vlseg2e16.v v6, (a0) +; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 8 x i16> @llvm.riscv.tuple.extract.nxv8i16.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) 
%0, i32 1) - ret <vscale x 8 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0 } - -define <vscale x 8 x i16> @test_vlseg2_mask_nxv8i16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv8i16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv8i16_triscv.vector.tuple_nxv16i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vlseg2e16.v v6, (a0), v0.t +; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 8 x i16> @llvm.riscv.tuple.extract.nxv8i16.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1) - ret <vscale x 8 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0 } - -declare target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2), ptr, <vscale x 16 x i1>, i32, i32, i32) - -define <vscale x 16 x i16> @test_vlseg2_nxv16i16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv16i16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg2_nxv16i16_triscv.vector.tuple_nxv32i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vlseg2e16.v v4, (a0) +; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 16 x i16> @llvm.riscv.tuple.extract.nxv16i16.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1) - ret <vscale x 16 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0 } - -define <vscale x 16 x i16> @test_vlseg2_mask_nxv16i16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 16 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv16i16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 16 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv16i16_triscv.vector.tuple_nxv32i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vlseg2e16.v v4, (a0), v0.t +; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 16 x i16> @llvm.riscv.tuple.extract.nxv16i16.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1) - ret <vscale x 16 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0 } - -declare target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 3), ptr, 
<vscale x 1 x i1>, i32, i32, i32) - -define <vscale x 1 x i16> @test_vlseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg3e16.v v7, (a0) +; CHECK-NEXT: vlseg3e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1) - ret <vscale x 1 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0 } - -define <vscale x 1 x i16> @test_vlseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1) - ret <vscale x 1 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0 } - -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3), ptr, <vscale x 2 x i1>, i32, i32, i32) - -define <vscale x 2 x i16> @test_vlseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg3e16.v v7, (a0) +; CHECK-NEXT: vlseg3e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1) - ret <vscale x 2 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0 } - -define <vscale x 2 x i16> @test_vlseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, 
e16, mf2, ta, ma -; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1) - ret <vscale x 2 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0 } - -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3), ptr, <vscale x 4 x i1>, i32, i32, i32) - -define <vscale x 4 x i16> @test_vlseg3_nxv4i16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv4i16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg3_nxv4i16_triscv.vector.tuple_nxv8i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg3e16.v v7, (a0) +; CHECK-NEXT: vlseg3e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1) - ret <vscale x 4 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0 } - -define <vscale x 4 x i16> @test_vlseg3_mask_nxv4i16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv4i16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv4i16_triscv.vector.tuple_nxv8i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1) - ret <vscale x 4 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0 } - -declare target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3), ptr, <vscale x 8 x i1>, i32, i32, i32) - -define <vscale x 8 x i16> @test_vlseg3_nxv8i16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv8i16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg3_nxv8i16_triscv.vector.tuple_nxv16i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vlseg3e16.v v6, (a0) +; CHECK-NEXT: vlseg3e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) 
@llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 8 x i16> @llvm.riscv.tuple.extract.nxv8i16.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1) - ret <vscale x 8 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0 } - -define <vscale x 8 x i16> @test_vlseg3_mask_nxv8i16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv8i16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv8i16_triscv.vector.tuple_nxv16i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vlseg3e16.v v6, (a0), v0.t +; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 8 x i16> @llvm.riscv.tuple.extract.nxv8i16.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1) - ret <vscale x 8 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0 } - -declare target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 4), ptr, <vscale x 1 x i1>, i32, i32, i32) - -define <vscale x 1 x i16> @test_vlseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg4e16.v v7, (a0) +; CHECK-NEXT: vlseg4e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1) - ret <vscale x 1 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0 } - -define <vscale x 1 x i16> @test_vlseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1) - ret <vscale x 1 x i16> %1 + ret 
target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0 } - -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4), ptr, <vscale x 2 x i1>, i32, i32, i32) - -define <vscale x 2 x i16> @test_vlseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg4e16.v v7, (a0) +; CHECK-NEXT: vlseg4e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1) - ret <vscale x 2 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0 } - -define <vscale x 2 x i16> @test_vlseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1) - ret <vscale x 2 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0 } - -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4), ptr, <vscale x 4 x i1>, i32, i32, i32) - -define <vscale x 4 x i16> @test_vlseg4_nxv4i16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv4i16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg4_nxv4i16_triscv.vector.tuple_nxv8i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg4e16.v v7, (a0) +; CHECK-NEXT: vlseg4e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1) - ret <vscale x 4 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0 } - -define <vscale x 4 x i16> @test_vlseg4_mask_nxv4i16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) 
@test_vlseg4_mask_nxv4i16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv4i16_triscv.vector.tuple_nxv8i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1) - ret <vscale x 4 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0 } - -declare target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4), ptr, <vscale x 8 x i1>, i32, i32, i32) - -define <vscale x 8 x i16> @test_vlseg4_nxv8i16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv8i16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg4_nxv8i16_triscv.vector.tuple_nxv16i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vlseg4e16.v v6, (a0) +; CHECK-NEXT: vlseg4e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 8 x i16> @llvm.riscv.tuple.extract.nxv8i16.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1) - ret <vscale x 8 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0 } - -define <vscale x 8 x i16> @test_vlseg4_mask_nxv8i16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv8i16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv8i16_triscv.vector.tuple_nxv16i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vlseg4e16.v v6, (a0), v0.t +; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 8 x i16> @llvm.riscv.tuple.extract.nxv8i16.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1) - ret <vscale x 8 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0 } - -declare target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 5), ptr, <vscale x 1 x i1>, i32, i32, i32) - -define <vscale x 1 x i16> @test_vlseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl) { ; CHECK-LABEL: 
test_vlseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg5e16.v v7, (a0) +; CHECK-NEXT: vlseg5e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1) - ret <vscale x 1 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0 } - -define <vscale x 1 x i16> @test_vlseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1) - ret <vscale x 1 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0 } - -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5), ptr, <vscale x 2 x i1>, i32, i32, i32) - -define <vscale x 2 x i16> @test_vlseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg5e16.v v7, (a0) +; CHECK-NEXT: vlseg5e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1) - ret <vscale x 2 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0 } - -define <vscale x 2 x i16> @test_vlseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1(target("riscv.vector.tuple", 
<vscale x 4 x i8>, 5) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1) - ret <vscale x 2 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0 } - -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5), ptr, <vscale x 4 x i1>, i32, i32, i32) - -define <vscale x 4 x i16> @test_vlseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg5e16.v v7, (a0) +; CHECK-NEXT: vlseg5e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1) - ret <vscale x 4 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0 } - -define <vscale x 4 x i16> @test_vlseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1) - ret <vscale x 4 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0 } - -declare target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 6), ptr, <vscale x 1 x i1>, i32, i32, i32) - -define <vscale x 1 x i16> @test_vlseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg6e16.v v7, (a0) +; CHECK-NEXT: vlseg6e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1) - 
ret <vscale x 1 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0 } - -define <vscale x 1 x i16> @test_vlseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1) - ret <vscale x 1 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0 } - -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6), ptr, <vscale x 2 x i1>, i32, i32, i32) - -define <vscale x 2 x i16> @test_vlseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg6e16.v v7, (a0) +; CHECK-NEXT: vlseg6e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1) - ret <vscale x 2 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0 } - -define <vscale x 2 x i16> @test_vlseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1) - ret <vscale x 2 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0 } - -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6), ptr, <vscale x 4 x i1>, i32, i32, i32) - -define <vscale x 4 x i16> 
@test_vlseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg6e16.v v7, (a0) +; CHECK-NEXT: vlseg6e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1) - ret <vscale x 4 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0 } - -define <vscale x 4 x i16> @test_vlseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1) - ret <vscale x 4 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0 } - -declare target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 7), ptr, <vscale x 1 x i1>, i32, i32, i32) - -define <vscale x 1 x i16> @test_vlseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg7e16.v v7, (a0) +; CHECK-NEXT: vlseg7e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1) - ret <vscale x 1 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0 } - -define <vscale x 1 x i16> @test_vlseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t +; 
CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1) - ret <vscale x 1 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0 } - -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7), ptr, <vscale x 2 x i1>, i32, i32, i32) - -define <vscale x 2 x i16> @test_vlseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg7e16.v v7, (a0) +; CHECK-NEXT: vlseg7e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1) - ret <vscale x 2 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0 } - -define <vscale x 2 x i16> @test_vlseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1) - ret <vscale x 2 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0 } - -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7), ptr, <vscale x 4 x i1>, i32, i32, i32) - -define <vscale x 4 x i16> @test_vlseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg7e16.v v7, (a0) +; CHECK-NEXT: vlseg7e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) 
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-define <vscale x 4 x i16> @test_vlseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 8), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i16> @test_vlseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0)
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0
}
-
-define <vscale x 1 x i16> @test_vlseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i16> @test_vlseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0)
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-define <vscale x 2 x i16> @test_vlseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8), ptr, <vscale x 4 x i1>, i32, i32, i32)
-
-define <vscale x 4 x i16> @test_vlseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0)
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-define <vscale x 4 x i16> @test_vlseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i32> @test_vlseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg2e32.v v7, (a0)
+; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-define <vscale x 1 x i32> @test_vlseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg2e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i32> @test_vlseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg2e32.v v7, (a0)
+; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-define <vscale x 2 x i32> @test_vlseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg2e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2), ptr, <vscale x 4 x i1>, i32, i32, i32)
-
-define <vscale x 4 x i32> @test_vlseg2_nxv4i32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv4i32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv4i32_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg2e32.v v6, (a0)
+; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-define <vscale x 4 x i32> @test_vlseg2_mask_nxv4i32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv4i32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv4i32_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg2e32.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2), ptr, <vscale x 8 x i1>, i32, i32, i32)
-
-define <vscale x 8 x i32> @test_vlseg2_nxv8i32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv8i32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv8i32_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT: vlseg2e32.v v4, (a0)
+; CHECK-NEXT: vlseg2e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 8 x i32> @llvm.riscv.tuple.extract.nxv8i32.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-define <vscale x 8 x i32> @test_vlseg2_mask_nxv8i32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv8i32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv8i32_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
-; CHECK-NEXT: vlseg2e32.v v4, (a0), v0.t
+; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 8 x i32> @llvm.riscv.tuple.extract.nxv8i32.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i32> @test_vlseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg3e32.v v7, (a0)
+; CHECK-NEXT: vlseg3e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-define <vscale x 1 x i32> @test_vlseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg3e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i32> @test_vlseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg3e32.v v7, (a0)
+; CHECK-NEXT: vlseg3e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-define <vscale x 2 x i32> @test_vlseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg3e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3), ptr, <vscale x 4 x i1>, i32, i32, i32)
-
-define <vscale x 4 x i32> @test_vlseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg3e32.v v6, (a0)
+; CHECK-NEXT: vlseg3e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-define <vscale x 4 x i32> @test_vlseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg3e32.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i32> @test_vlseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg4e32.v v7, (a0)
+; CHECK-NEXT: vlseg4e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-define <vscale x 1 x i32> @test_vlseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg4e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i32> @test_vlseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg4e32.v v7, (a0)
+; CHECK-NEXT: vlseg4e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-define <vscale x 2 x i32> @test_vlseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg4e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4), ptr, <vscale x 4 x i1>, i32, i32, i32)
-
-define <vscale x 4 x i32> @test_vlseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg4e32.v v6, (a0)
+; CHECK-NEXT: vlseg4e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-define <vscale x 4 x i32> @test_vlseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT: vlseg4e32.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg4e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i32> @test_vlseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg5e32.v v7, (a0)
+; CHECK-NEXT: vlseg5e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-define <vscale x 1 x i32> @test_vlseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg5e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i32> @test_vlseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg5e32.v v7, (a0)
+; CHECK-NEXT: vlseg5e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-define <vscale x 2 x i32> @test_vlseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg5e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i32> @test_vlseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg6e32.v v7, (a0)
+; CHECK-NEXT: vlseg6e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-define <vscale x 1 x i32> @test_vlseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg6e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i32> @test_vlseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg6e32.v v7, (a0)
+; CHECK-NEXT: vlseg6e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-define <vscale x 2 x i32> @test_vlseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg6e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i32> @test_vlseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg7e32.v v7, (a0)
+; CHECK-NEXT: vlseg7e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-define <vscale x 1 x i32> @test_vlseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg7e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 5)
- %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i32> @test_vlseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg7e32.v v7, (a0)
+; CHECK-NEXT: vlseg7e32.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i32 %vl, i32 5)
- %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x i32> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-define <vscale x 2 x i32> @test_vlseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT: vlseg7e32.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e32.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
i32 1, i32 5) - %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1) - ret <vscale x 2 x i32> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0 } - -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8), ptr, <vscale x 1 x i1>, i32, i32, i32) - -define <vscale x 1 x i32> @test_vlseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg8e32.v v7, (a0) +; CHECK-NEXT: vlseg8e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, i32 %vl, i32 5) - %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1) - ret <vscale x 1 x i32> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0 } - -define <vscale x 1 x i32> @test_vlseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg8e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg8e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 5) - %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1) - ret <vscale x 1 x i32> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0 } - -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8), ptr, <vscale x 2 x i1>, i32, i32, i32) - -define <vscale x 2 x i32> @test_vlseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg8e32.v v7, (a0) +; CHECK-NEXT: vlseg8e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i32 %vl, i32 5) - %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1) - ret <vscale x 2 x i32> %1 + ret target("riscv.vector.tuple", <vscale x 8 
x i8>, 8) %0 } - -define <vscale x 2 x i32> @test_vlseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg8e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg8e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 5) - %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1) - ret <vscale x 2 x i32> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0 } - -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2), ptr, <vscale x 1 x i1>, i32, i32, i32) - -define <vscale x 1 x i64> @test_vlseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg2e64.v v7, (a0) +; CHECK-NEXT: vlseg2e64.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i32 %vl, i32 6) - %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1) - ret <vscale x 1 x i64> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0 } - -define <vscale x 1 x i64> @test_vlseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg2e64.v v7, (a0), v0.t +; CHECK-NEXT: vlseg2e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 6) - %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1) - ret <vscale x 1 x i64> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0 } - -declare target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2), ptr, <vscale x 2 x i1>, i32, i32, i32) - -define <vscale x 2 x i64> @test_vlseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) { 
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg2e64.v v6, (a0)
+; CHECK-NEXT: vlseg2e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 2 x i64> @llvm.riscv.tuple.extract.nxv2i64.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-define <vscale x 2 x i64> @test_vlseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg2e64.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg2e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 2 x i64> @llvm.riscv.tuple.extract.nxv2i64.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2), ptr, <vscale x 4 x i1>, i32, i32, i32)
-
-define <vscale x 4 x i64> @test_vlseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT: vlseg2e64.v v4, (a0)
+; CHECK-NEXT: vlseg2e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 4 x i64> @llvm.riscv.tuple.extract.nxv4i64.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-define <vscale x 4 x i64> @test_vlseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
-; CHECK-NEXT: vlseg2e64.v v4, (a0), v0.t
+; CHECK-NEXT: vlseg2e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 4 x i64> @llvm.riscv.tuple.extract.nxv4i64.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i64> @test_vlseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg3e64.v v7, (a0)
+; CHECK-NEXT: vlseg3e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-define <vscale x 1 x i64> @test_vlseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg3e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i64> @test_vlseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg3e64.v v6, (a0)
+; CHECK-NEXT: vlseg3e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 2 x i64> @llvm.riscv.tuple.extract.nxv2i64.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-define <vscale x 2 x i64> @test_vlseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg3e64.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg3e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 2 x i64> @llvm.riscv.tuple.extract.nxv2i64.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i64> @test_vlseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg4e64.v v7, (a0)
+; CHECK-NEXT: vlseg4e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-define <vscale x 1 x i64> @test_vlseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg4e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4), ptr, <vscale x 2 x i1>, i32, i32, i32)
-
-define <vscale x 2 x i64> @test_vlseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg4e64.v v6, (a0)
+; CHECK-NEXT: vlseg4e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 2 x i64> @llvm.riscv.tuple.extract.nxv2i64.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-define <vscale x 2 x i64> @test_vlseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg4e64.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg4e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 2 x i64> @llvm.riscv.tuple.extract.nxv2i64.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i64> @test_vlseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg5e64.v v7, (a0)
+; CHECK-NEXT: vlseg5e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-define <vscale x 1 x i64> @test_vlseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg5e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i64> @test_vlseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg6e64.v v7, (a0)
+; CHECK-NEXT: vlseg6e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-define <vscale x 1 x i64> @test_vlseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg6e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i64> @test_vlseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg7e64.v v7, (a0)
+; CHECK-NEXT: vlseg7e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-define <vscale x 1 x i64> @test_vlseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg7e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8), ptr, <vscale x 1 x i1>, i32, i32, i32)
-
-define <vscale x 1 x i64> @test_vlseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg8e64.v v7, (a0)
+; CHECK-NEXT: vlseg8e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i32 %vl, i32 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-define <vscale x 1 x i64> @test_vlseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg8e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-
-define <vscale x 1 x half> @test_vlseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0
}
-
-define <vscale x 1 x half> @test_vlseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0
}
-
-
-define <vscale x 2 x half> @test_vlseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-define <vscale x 2 x half> @test_vlseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-
-define <vscale x 4 x half> @test_vlseg2_nxv4f16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv4f16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv4f16_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-define <vscale x 4 x half> @test_vlseg2_mask_nxv4f16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv4f16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv4f16_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-
-define <vscale x 8 x half> @test_vlseg2_nxv8f16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv8f16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv8f16_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v6, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 8 x half> @llvm.riscv.tuple.extract.nxv8f16.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-define <vscale x 8 x half> @test_vlseg2_mask_nxv8f16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv8f16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv8f16_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 8 x half> @llvm.riscv.tuple.extract.nxv8f16.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-
-define <vscale x 16 x half> @test_vlseg2_nxv16f16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv16f16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg2_nxv16f16_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v4, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 16 x half> @llvm.riscv.tuple.extract.nxv16f16.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 16 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-define <vscale x 16 x half> @test_vlseg2_mask_nxv16f16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 16 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv16f16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 16 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv16f16_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v4, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 16 x half> @llvm.riscv.tuple.extract.nxv16f16.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 16 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-
-define <vscale x 1 x half> @test_vlseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0
}
-
-define <vscale x 1 x half> @test_vlseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0
}
-
-
-define <vscale x 2 x half> @test_vlseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-define <vscale x 2 x half> @test_vlseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-
-define <vscale x 4 x half> @test_vlseg3_nxv4f16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv4f16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv4f16_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-define <vscale x 4 x half> @test_vlseg3_mask_nxv4f16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv4f16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv4f16_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-
-define <vscale x 8 x half> @test_vlseg3_nxv8f16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv8f16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg3_nxv8f16_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v6, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 8 x half> @llvm.riscv.tuple.extract.nxv8f16.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 8 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-define <vscale x 8 x half> @test_vlseg3_mask_nxv8f16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv8f16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv8f16_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 8 x half> @llvm.riscv.tuple.extract.nxv8f16.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 8 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-
-define <vscale x 1 x half> @test_vlseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0
}
-
-define <vscale x 1 x half> @test_vlseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0
}
-
-
-define <vscale x 2 x half> @test_vlseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-define <vscale x 2 x half> @test_vlseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-
-define <vscale x 4 x half> @test_vlseg4_nxv4f16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv4f16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv4f16_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-define <vscale x 4 x half> @test_vlseg4_mask_nxv4f16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv4f16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv4f16_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-
-define <vscale x 8 x half> @test_vlseg4_nxv8f16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv8f16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg4_nxv8f16_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v6, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 8 x half> @llvm.riscv.tuple.extract.nxv8f16.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 8 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-define <vscale x 8 x half> @test_vlseg4_mask_nxv8f16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv8f16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv8f16_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 8 x half> @llvm.riscv.tuple.extract.nxv8f16.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 8 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-
-define <vscale x 1 x half> @test_vlseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0)
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0
}
-
-define <vscale x 1 x half> @test_vlseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0
}
-
-
-define <vscale x 2 x half> @test_vlseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0)
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-define <vscale x 2 x half> @test_vlseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-
-define <vscale x 4 x half> @test_vlseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0)
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-define <vscale x 4 x half> @test_vlseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-
-define <vscale x 1 x half> @test_vlseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0)
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0
}
-
-define <vscale x 1 x half> @test_vlseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0
}
-
-
-define <vscale x 2 x half> @test_vlseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0)
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-define <vscale x 2 x half> @test_vlseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-
-define <vscale x 4 x half> @test_vlseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0)
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-define <vscale x 4 x half> @test_vlseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-
-define <vscale x 1 x half> @test_vlseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0)
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0
}
-
-define <vscale x 1 x half> @test_vlseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0
}
-
-
-define <vscale x 2 x half> @test_vlseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0)
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-define <vscale x 2 x half> @test_vlseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-
-define <vscale x 4 x half> @test_vlseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) {
; CHECK-LABEL: test_vlseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0)
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i32 %vl, i32 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-define <vscale x 4 x half> @test_vlseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1) - ret <vscale x 4 x half> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0 } - - -define <vscale x 1 x half> @test_vlseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg8e16.v v7, (a0) +; CHECK-NEXT: vlseg8e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1) - ret <vscale x 1 x half> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0 } - -define <vscale x 1 x half> @test_vlseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1) - ret <vscale x 1 x half> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0 } - - -define <vscale x 2 x half> @test_vlseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg8e16.v v7, (a0) +; CHECK-NEXT: vlseg8e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) 
@llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1) - ret <vscale x 2 x half> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0 } - -define <vscale x 2 x half> @test_vlseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1) - ret <vscale x 2 x half> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0 } - - -define <vscale x 4 x half> @test_vlseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg8e16.v v7, (a0) +; CHECK-NEXT: vlseg8e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1) - ret <vscale x 4 x half> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0 } - -define <vscale x 4 x half> @test_vlseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1) - ret <vscale x 4 x half> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0 } - - -define <vscale x 1 x float> @test_vlseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x 
i8>, 2) @test_vlseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg2e32.v v7, (a0) +; CHECK-NEXT: vlseg2e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, i32 %vl, i32 5) - %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1) - ret <vscale x 1 x float> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0 } - -define <vscale x 1 x float> @test_vlseg2_mask_nxv1f32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_mask_nxv1f32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv1f32_triscv.vector.tuple_nxv4i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg2e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 5) - %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1) - ret <vscale x 1 x float> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0 } - - -define <vscale x 2 x float> @test_vlseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg2e32.v v7, (a0) +; CHECK-NEXT: vlseg2e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i32 %vl, i32 5) - %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1) - ret <vscale x 2 x float> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0 } - -define <vscale x 2 x float> @test_vlseg2_mask_nxv2f32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv2f32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv2f32_triscv.vector.tuple_nxv8i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg2e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 5) - %1 = call 
<vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1) - ret <vscale x 2 x float> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0 } - - -define <vscale x 4 x float> @test_vlseg2_nxv4f32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv4f32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg2_nxv4f32_triscv.vector.tuple_nxv16i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vlseg2e32.v v6, (a0) +; CHECK-NEXT: vlseg2e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i32 %vl, i32 5) - %1 = call <vscale x 4 x float> @llvm.riscv.tuple.extract.nxv4f32.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1) - ret <vscale x 4 x float> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0 } - -define <vscale x 4 x float> @test_vlseg2_mask_nxv4f32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv4f32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv4f32_triscv.vector.tuple_nxv16i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vlseg2e32.v v6, (a0), v0.t +; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5) - %1 = call <vscale x 4 x float> @llvm.riscv.tuple.extract.nxv4f32.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1) - ret <vscale x 4 x float> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0 } - - -define <vscale x 8 x float> @test_vlseg2_nxv8f32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv8f32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg2_nxv8f32_triscv.vector.tuple_nxv32i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; CHECK-NEXT: vlseg2e32.v v4, (a0) +; CHECK-NEXT: vlseg2e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i32 %vl, i32 5) - %1 = call <vscale x 8 x float> @llvm.riscv.tuple.extract.nxv8f32.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1) - ret <vscale x 8 x float> %1 + ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0 } - -define <vscale x 8 x float> @test_vlseg2_mask_nxv8f32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv8f32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv8f32_triscv.vector.tuple_nxv32i8_2t: ; CHECK: # 
%bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; CHECK-NEXT: vlseg2e32.v v4, (a0), v0.t +; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 5) - %1 = call <vscale x 8 x float> @llvm.riscv.tuple.extract.nxv8f32.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1) - ret <vscale x 8 x float> %1 + ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0 } - - -define <vscale x 1 x float> @test_vlseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg3e32.v v7, (a0) +; CHECK-NEXT: vlseg3e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, i32 %vl, i32 5) - %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1) - ret <vscale x 1 x float> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0 } - -define <vscale x 1 x float> @test_vlseg3_mask_nxv1f32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_mask_nxv1f32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv1f32_triscv.vector.tuple_nxv4i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg3e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 5) - %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1) - ret <vscale x 1 x float> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0 } - - -define <vscale x 2 x float> @test_vlseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg3e32.v v7, (a0) +; CHECK-NEXT: vlseg3e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i32 %vl, i32 5) - %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1) - ret <vscale x 2 x float> %1 + ret target("riscv.vector.tuple", 
<vscale x 8 x i8>, 3) %0 } - -define <vscale x 2 x float> @test_vlseg3_mask_nxv2f32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv2f32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv2f32_triscv.vector.tuple_nxv8i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg3e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 5) - %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1) - ret <vscale x 2 x float> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0 } - - -define <vscale x 4 x float> @test_vlseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vlseg3e32.v v6, (a0) +; CHECK-NEXT: vlseg3e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i32 %vl, i32 5) - %1 = call <vscale x 4 x float> @llvm.riscv.tuple.extract.nxv4f32.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1) - ret <vscale x 4 x float> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0 } - -define <vscale x 4 x float> @test_vlseg3_mask_nxv4f32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv4f32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv4f32_triscv.vector.tuple_nxv16i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vlseg3e32.v v6, (a0), v0.t +; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5) - %1 = call <vscale x 4 x float> @llvm.riscv.tuple.extract.nxv4f32.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1) - ret <vscale x 4 x float> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0 } - - -define <vscale x 1 x float> @test_vlseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg4e32.v v7, (a0) +; CHECK-NEXT: vlseg4e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail 
call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, i32 %vl, i32 5) - %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1) - ret <vscale x 1 x float> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0 } - -define <vscale x 1 x float> @test_vlseg4_mask_nxv1f32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_mask_nxv1f32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv1f32_triscv.vector.tuple_nxv4i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg4e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg4e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 5) - %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1) - ret <vscale x 1 x float> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0 } - - -define <vscale x 2 x float> @test_vlseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg4e32.v v7, (a0) +; CHECK-NEXT: vlseg4e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i32 %vl, i32 5) - %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1) - ret <vscale x 2 x float> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0 } - -define <vscale x 2 x float> @test_vlseg4_mask_nxv2f32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv2f32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv2f32_triscv.vector.tuple_nxv8i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg4e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg4e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 5) - %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1) - ret <vscale x 2 x float> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0 } - - -define <vscale x 4 x float> @test_vlseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4t(ptr 
%base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vlseg4e32.v v6, (a0) +; CHECK-NEXT: vlseg4e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i32 %vl, i32 5) - %1 = call <vscale x 4 x float> @llvm.riscv.tuple.extract.nxv4f32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1) - ret <vscale x 4 x float> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0 } - -define <vscale x 4 x float> @test_vlseg4_mask_nxv4f32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv4f32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv4f32_triscv.vector.tuple_nxv16i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vlseg4e32.v v6, (a0), v0.t +; CHECK-NEXT: vlseg4e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 5) - %1 = call <vscale x 4 x float> @llvm.riscv.tuple.extract.nxv4f32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1) - ret <vscale x 4 x float> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0 } - - -define <vscale x 1 x float> @test_vlseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg5e32.v v7, (a0) +; CHECK-NEXT: vlseg5e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, i32 %vl, i32 5) - %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1) - ret <vscale x 1 x float> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0 } - -define <vscale x 1 x float> @test_vlseg5_mask_nxv1f32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_mask_nxv1f32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg5_mask_nxv1f32_triscv.vector.tuple_nxv4i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg5e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg5e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x 
i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 5) - %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1) - ret <vscale x 1 x float> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0 } - - -define <vscale x 2 x float> @test_vlseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg5e32.v v7, (a0) +; CHECK-NEXT: vlseg5e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i32 %vl, i32 5) - %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1) - ret <vscale x 2 x float> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0 } - -define <vscale x 2 x float> @test_vlseg5_mask_nxv2f32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv2f32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg5_mask_nxv2f32_triscv.vector.tuple_nxv8i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg5e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg5e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 5) - %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1) - ret <vscale x 2 x float> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0 } - - -define <vscale x 1 x float> @test_vlseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg6e32.v v7, (a0) +; CHECK-NEXT: vlseg6e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, i32 %vl, i32 5) - %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1) - ret <vscale x 1 x float> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0 } - -define <vscale x 1 x float> @test_vlseg6_mask_nxv1f32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_mask_nxv1f32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: 
test_vlseg6_mask_nxv1f32_triscv.vector.tuple_nxv4i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg6e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg6e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 5) - %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1) - ret <vscale x 1 x float> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0 } - - -define <vscale x 2 x float> @test_vlseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg6e32.v v7, (a0) +; CHECK-NEXT: vlseg6e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i32 %vl, i32 5) - %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1) - ret <vscale x 2 x float> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0 } - -define <vscale x 2 x float> @test_vlseg6_mask_nxv2f32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv2f32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg6_mask_nxv2f32_triscv.vector.tuple_nxv8i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg6e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg6e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 5) - %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1) - ret <vscale x 2 x float> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0 } - - -define <vscale x 1 x float> @test_vlseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg7e32.v v7, (a0) +; CHECK-NEXT: vlseg7e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, i32 %vl, i32 5) - %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1) - ret 
<vscale x 1 x float> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0 } - -define <vscale x 1 x float> @test_vlseg7_mask_nxv1f32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_mask_nxv1f32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg7_mask_nxv1f32_triscv.vector.tuple_nxv4i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg7e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg7e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 5) - %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1) - ret <vscale x 1 x float> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0 } - - -define <vscale x 2 x float> @test_vlseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg7e32.v v7, (a0) +; CHECK-NEXT: vlseg7e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i32 %vl, i32 5) - %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1) - ret <vscale x 2 x float> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0 } - -define <vscale x 2 x float> @test_vlseg7_mask_nxv2f32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv2f32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg7_mask_nxv2f32_triscv.vector.tuple_nxv8i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg7e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg7e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 5) - %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1) - ret <vscale x 2 x float> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0 } - - -define <vscale x 1 x float> @test_vlseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg8e32.v v7, (a0) +; CHECK-NEXT: vlseg8e32.v v8, 
(a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, i32 %vl, i32 5) - %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1) - ret <vscale x 1 x float> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0 } - -define <vscale x 1 x float> @test_vlseg8_mask_nxv1f32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_mask_nxv1f32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg8_mask_nxv1f32_triscv.vector.tuple_nxv4i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg8e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg8e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 5) - %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1) - ret <vscale x 1 x float> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0 } - - -define <vscale x 2 x float> @test_vlseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg8e32.v v7, (a0) +; CHECK-NEXT: vlseg8e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i32 %vl, i32 5) - %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1) - ret <vscale x 2 x float> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0 } - -define <vscale x 2 x float> @test_vlseg8_mask_nxv2f32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv2f32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg8_mask_nxv2f32_triscv.vector.tuple_nxv8i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg8e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg8e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 5) - %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1) - ret <vscale x 2 x float> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0 } - - -define <vscale x 1 x double> 
@test_vlseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg2e64.v v7, (a0) +; CHECK-NEXT: vlseg2e64.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i32 %vl, i32 6) - %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1) - ret <vscale x 1 x double> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0 } - -define <vscale x 1 x double> @test_vlseg2_mask_nxv1f64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv1f64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv1f64_triscv.vector.tuple_nxv8i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg2e64.v v7, (a0), v0.t +; CHECK-NEXT: vlseg2e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 6) - %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1) - ret <vscale x 1 x double> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0 } - - -define <vscale x 2 x double> @test_vlseg2_nxv2f64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv2f64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg2_nxv2f64_triscv.vector.tuple_nxv16i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vlseg2e64.v v6, (a0) +; CHECK-NEXT: vlseg2e64.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i32 %vl, i32 6) - %1 = call <vscale x 2 x double> @llvm.riscv.tuple.extract.nxv2f64.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1) - ret <vscale x 2 x double> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0 } - -define <vscale x 2 x double> @test_vlseg2_mask_nxv2f64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv2f64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv2f64_triscv.vector.tuple_nxv16i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vlseg2e64.v v6, (a0), v0.t +; CHECK-NEXT: vlseg2e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) 
@llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6) - %1 = call <vscale x 2 x double> @llvm.riscv.tuple.extract.nxv2f64.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1) - ret <vscale x 2 x double> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0 } - - -define <vscale x 4 x double> @test_vlseg2_nxv4f64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv4f64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg2_nxv4f64_triscv.vector.tuple_nxv32i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; CHECK-NEXT: vlseg2e64.v v4, (a0) +; CHECK-NEXT: vlseg2e64.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i32 %vl, i32 6) - %1 = call <vscale x 4 x double> @llvm.riscv.tuple.extract.nxv4f64.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1) - ret <vscale x 4 x double> %1 + ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0 } - -define <vscale x 4 x double> @test_vlseg2_mask_nxv4f64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv4f64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv4f64_triscv.vector.tuple_nxv32i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; CHECK-NEXT: vlseg2e64.v v4, (a0), v0.t +; CHECK-NEXT: vlseg2e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 6) - %1 = call <vscale x 4 x double> @llvm.riscv.tuple.extract.nxv4f64.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1) - ret <vscale x 4 x double> %1 + ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0 } - - -define <vscale x 1 x double> @test_vlseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg3e64.v v7, (a0) +; CHECK-NEXT: vlseg3e64.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i32 %vl, i32 6) - %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1) - ret <vscale x 1 x double> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0 } - -define <vscale x 1 x double> @test_vlseg3_mask_nxv1f64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x 
i8>, 3) @test_vlseg3_mask_nxv1f64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv1f64_triscv.vector.tuple_nxv8i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg3e64.v v7, (a0), v0.t +; CHECK-NEXT: vlseg3e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 6) - %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1) - ret <vscale x 1 x double> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0 } - - -define <vscale x 2 x double> @test_vlseg3_nxv2f64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv2f64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg3_nxv2f64_triscv.vector.tuple_nxv16i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vlseg3e64.v v6, (a0) +; CHECK-NEXT: vlseg3e64.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i32 %vl, i32 6) - %1 = call <vscale x 2 x double> @llvm.riscv.tuple.extract.nxv2f64.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1) - ret <vscale x 2 x double> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0 } - -define <vscale x 2 x double> @test_vlseg3_mask_nxv2f64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv2f64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv2f64_triscv.vector.tuple_nxv16i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vlseg3e64.v v6, (a0), v0.t +; CHECK-NEXT: vlseg3e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6) - %1 = call <vscale x 2 x double> @llvm.riscv.tuple.extract.nxv2f64.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1) - ret <vscale x 2 x double> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0 } - - -define <vscale x 1 x double> @test_vlseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg4e64.v v7, (a0) +; CHECK-NEXT: vlseg4e64.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i32 %vl, i32 6) - %1 = call 
<vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1) - ret <vscale x 1 x double> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0 } - -define <vscale x 1 x double> @test_vlseg4_mask_nxv1f64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv1f64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv1f64_triscv.vector.tuple_nxv8i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg4e64.v v7, (a0), v0.t +; CHECK-NEXT: vlseg4e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 6) - %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1) - ret <vscale x 1 x double> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0 } - - -define <vscale x 2 x double> @test_vlseg4_nxv2f64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv2f64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg4_nxv2f64_triscv.vector.tuple_nxv16i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vlseg4e64.v v6, (a0) +; CHECK-NEXT: vlseg4e64.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i32 %vl, i32 6) - %1 = call <vscale x 2 x double> @llvm.riscv.tuple.extract.nxv2f64.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1) - ret <vscale x 2 x double> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0 } - -define <vscale x 2 x double> @test_vlseg4_mask_nxv2f64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv2f64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv2f64_triscv.vector.tuple_nxv16i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vlseg4e64.v v6, (a0), v0.t +; CHECK-NEXT: vlseg4e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 6) - %1 = call <vscale x 2 x double> @llvm.riscv.tuple.extract.nxv2f64.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1) - ret <vscale x 2 x double> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0 } - - -define <vscale x 1 x double> @test_vlseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) { ; CHECK-LABEL: 
test_vlseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg5e64.v v7, (a0) +; CHECK-NEXT: vlseg5e64.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i32 %vl, i32 6) - %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1) - ret <vscale x 1 x double> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0 } - -define <vscale x 1 x double> @test_vlseg5_mask_nxv1f64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv1f64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg5_mask_nxv1f64_triscv.vector.tuple_nxv8i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg5e64.v v7, (a0), v0.t +; CHECK-NEXT: vlseg5e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 6) - %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1) - ret <vscale x 1 x double> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0 } - - -define <vscale x 1 x double> @test_vlseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg6e64.v v7, (a0) +; CHECK-NEXT: vlseg6e64.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i32 %vl, i32 6) - %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1) - ret <vscale x 1 x double> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0 } - -define <vscale x 1 x double> @test_vlseg6_mask_nxv1f64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv1f64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg6_mask_nxv1f64_triscv.vector.tuple_nxv8i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg6e64.v v7, (a0), v0.t +; CHECK-NEXT: vlseg6e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 6) - %1 = call <vscale x 1 x double> 
@llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1) - ret <vscale x 1 x double> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0 } - - -define <vscale x 1 x double> @test_vlseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg7e64.v v7, (a0) +; CHECK-NEXT: vlseg7e64.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i32 %vl, i32 6) - %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1) - ret <vscale x 1 x double> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0 } - -define <vscale x 1 x double> @test_vlseg7_mask_nxv1f64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv1f64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg7_mask_nxv1f64_triscv.vector.tuple_nxv8i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg7e64.v v7, (a0), v0.t +; CHECK-NEXT: vlseg7e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 6) - %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1) - ret <vscale x 1 x double> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0 } - - -define <vscale x 1 x double> @test_vlseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg8e64.v v7, (a0) +; CHECK-NEXT: vlseg8e64.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i32 %vl, i32 6) - %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1) - ret <vscale x 1 x double> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0 } - -define <vscale x 1 x double> @test_vlseg8_mask_nxv1f64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv1f64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg8_mask_nxv1f64_triscv.vector.tuple_nxv8i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, 
e64, m1, ta, ma -; CHECK-NEXT: vlseg8e64.v v7, (a0), v0.t +; CHECK-NEXT: vlseg8e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 6) - %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1) - ret <vscale x 1 x double> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0 } - - -define <vscale x 1 x bfloat> @test_vlseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg2e16.v v7, (a0) +; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1) - ret <vscale x 1 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0 } - -define <vscale x 1 x bfloat> @test_vlseg2_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1) - ret <vscale x 1 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0 } - - -define <vscale x 2 x bfloat> @test_vlseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg2e16.v v7, (a0) +; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1) - ret <vscale x 2 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0 } - 
-define <vscale x 2 x bfloat> @test_vlseg2_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1) - ret <vscale x 2 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0 } - - -define <vscale x 4 x bfloat> @test_vlseg2_nxv4bf16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv4bf16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg2_nxv4bf16_triscv.vector.tuple_nxv8i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg2e16.v v7, (a0) +; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1) - ret <vscale x 4 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0 } - -define <vscale x 4 x bfloat> @test_vlseg2_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1) - ret <vscale x 4 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0 } - - -define <vscale x 8 x bfloat> @test_vlseg2_nxv8bf16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv8bf16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg2_nxv8bf16_triscv.vector.tuple_nxv16i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vlseg2e16.v v6, (a0) +; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call 
target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 8 x bfloat> @llvm.riscv.tuple.extract.nxv8bf16.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1) - ret <vscale x 8 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0 } - -define <vscale x 8 x bfloat> @test_vlseg2_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vlseg2e16.v v6, (a0), v0.t +; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 8 x bfloat> @llvm.riscv.tuple.extract.nxv8bf16.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1) - ret <vscale x 8 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0 } - - -define <vscale x 16 x bfloat> @test_vlseg2_nxv16bf16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv16bf16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg2_nxv16bf16_triscv.vector.tuple_nxv32i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vlseg2e16.v v4, (a0) +; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 16 x bfloat> @llvm.riscv.tuple.extract.nxv16bf16.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1) - ret <vscale x 16 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0 } - -define <vscale x 16 x bfloat> @test_vlseg2_mask_nxv16bf16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 16 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv16bf16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i32 %vl, <vscale x 16 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv16bf16_triscv.vector.tuple_nxv32i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vlseg2e16.v v4, (a0), v0.t +; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 16 x bfloat> @llvm.riscv.tuple.extract.nxv16bf16.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1) - ret <vscale x 16 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0 } - - -define <vscale x 
1 x bfloat> @test_vlseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg3e16.v v7, (a0) +; CHECK-NEXT: vlseg3e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1) - ret <vscale x 1 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0 } - -define <vscale x 1 x bfloat> @test_vlseg3_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1) - ret <vscale x 1 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0 } - - -define <vscale x 2 x bfloat> @test_vlseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg3e16.v v7, (a0) +; CHECK-NEXT: vlseg3e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1) - ret <vscale x 2 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0 } - -define <vscale x 2 x bfloat> @test_vlseg3_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) 
@llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1) - ret <vscale x 2 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0 } - - -define <vscale x 4 x bfloat> @test_vlseg3_nxv4bf16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv4bf16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg3_nxv4bf16_triscv.vector.tuple_nxv8i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg3e16.v v7, (a0) +; CHECK-NEXT: vlseg3e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1) - ret <vscale x 4 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0 } - -define <vscale x 4 x bfloat> @test_vlseg3_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1) - ret <vscale x 4 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0 } - - -define <vscale x 8 x bfloat> @test_vlseg3_nxv8bf16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv8bf16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg3_nxv8bf16_triscv.vector.tuple_nxv16i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vlseg3e16.v v6, (a0) +; CHECK-NEXT: vlseg3e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 8 x bfloat> @llvm.riscv.tuple.extract.nxv8bf16.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1) - ret <vscale x 8 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0 } - -define <vscale x 8 x bfloat> @test_vlseg3_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x 
i8>, 3) @test_vlseg3_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vlseg3e16.v v6, (a0), v0.t +; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 8 x bfloat> @llvm.riscv.tuple.extract.nxv8bf16.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1) - ret <vscale x 8 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0 } - - -define <vscale x 1 x bfloat> @test_vlseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg4e16.v v7, (a0) +; CHECK-NEXT: vlseg4e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1) - ret <vscale x 1 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0 } - -define <vscale x 1 x bfloat> @test_vlseg4_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1) - ret <vscale x 1 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0 } - - -define <vscale x 2 x bfloat> @test_vlseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg4e16.v v7, (a0) +; CHECK-NEXT: vlseg4e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, i32 %vl, i32 4) - %1 = 
call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1) - ret <vscale x 2 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0 } - -define <vscale x 2 x bfloat> @test_vlseg4_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1) - ret <vscale x 2 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0 } - - -define <vscale x 4 x bfloat> @test_vlseg4_nxv4bf16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv4bf16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg4_nxv4bf16_triscv.vector.tuple_nxv8i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg4e16.v v7, (a0) +; CHECK-NEXT: vlseg4e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1) - ret <vscale x 4 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0 } - -define <vscale x 4 x bfloat> @test_vlseg4_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1) - ret <vscale x 4 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0 } - - -define <vscale x 8 x bfloat> @test_vlseg4_nxv8bf16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv8bf16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl) { ; CHECK-LABEL: 
test_vlseg4_nxv8bf16_triscv.vector.tuple_nxv16i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vlseg4e16.v v6, (a0) +; CHECK-NEXT: vlseg4e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 8 x bfloat> @llvm.riscv.tuple.extract.nxv8bf16.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1) - ret <vscale x 8 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0 } - -define <vscale x 8 x bfloat> @test_vlseg4_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i32 %vl, <vscale x 8 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vlseg4e16.v v6, (a0), v0.t +; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 8 x bfloat> @llvm.riscv.tuple.extract.nxv8bf16.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1) - ret <vscale x 8 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0 } - - -define <vscale x 1 x bfloat> @test_vlseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg5e16.v v7, (a0) +; CHECK-NEXT: vlseg5e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1) - ret <vscale x 1 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0 } - -define <vscale x 1 x bfloat> @test_vlseg5_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg5_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 1 x bfloat> 
@llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1) - ret <vscale x 1 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0 } - - -define <vscale x 2 x bfloat> @test_vlseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg5e16.v v7, (a0) +; CHECK-NEXT: vlseg5e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1) - ret <vscale x 2 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0 } - -define <vscale x 2 x bfloat> @test_vlseg5_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg5_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1) - ret <vscale x 2 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0 } - - -define <vscale x 4 x bfloat> @test_vlseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg5e16.v v7, (a0) +; CHECK-NEXT: vlseg5e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1) - ret <vscale x 4 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0 } - -define <vscale x 4 x bfloat> @test_vlseg5_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg5_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1) - ret <vscale x 4 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0 } - - -define <vscale x 1 x bfloat> @test_vlseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg6e16.v v7, (a0) +; CHECK-NEXT: vlseg6e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1) - ret <vscale x 1 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0 } - -define <vscale x 1 x bfloat> @test_vlseg6_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg6_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1) - ret <vscale x 1 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0 } - - -define <vscale x 2 x bfloat> @test_vlseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg6e16.v v7, (a0) +; CHECK-NEXT: vlseg6e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1) - ret <vscale x 2 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 4 x 
i8>, 6) %0 } - -define <vscale x 2 x bfloat> @test_vlseg6_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg6_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1) - ret <vscale x 2 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0 } - - -define <vscale x 4 x bfloat> @test_vlseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg6e16.v v7, (a0) +; CHECK-NEXT: vlseg6e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1) - ret <vscale x 4 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0 } - -define <vscale x 4 x bfloat> @test_vlseg6_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg6_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1) - ret <vscale x 4 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0 } - - -define <vscale x 1 x bfloat> @test_vlseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg7e16.v v7, (a0) +; CHECK-NEXT: vlseg7e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call 
target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1) - ret <vscale x 1 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0 } - -define <vscale x 1 x bfloat> @test_vlseg7_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg7_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1) - ret <vscale x 1 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0 } - - -define <vscale x 2 x bfloat> @test_vlseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg7e16.v v7, (a0) +; CHECK-NEXT: vlseg7e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1) - ret <vscale x 2 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0 } - -define <vscale x 2 x bfloat> @test_vlseg7_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg7_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1) - ret <vscale x 2 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0 } - - -define <vscale x 4 x bfloat> 
@test_vlseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg7e16.v v7, (a0) +; CHECK-NEXT: vlseg7e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1) - ret <vscale x 4 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0 } - -define <vscale x 4 x bfloat> @test_vlseg7_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg7_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1) - ret <vscale x 4 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0 } - - -define <vscale x 1 x bfloat> @test_vlseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg8e16.v v7, (a0) +; CHECK-NEXT: vlseg8e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1) - ret <vscale x 1 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0 } - -define <vscale x 1 x bfloat> @test_vlseg8_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i32 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg8_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) 
@llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1) - ret <vscale x 1 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0 } - - -define <vscale x 2 x bfloat> @test_vlseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg8e16.v v7, (a0) +; CHECK-NEXT: vlseg8e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1) - ret <vscale x 2 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0 } - -define <vscale x 2 x bfloat> @test_vlseg8_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i32 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg8_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, <vscale x 2 x i1> %mask, i32 %vl, i32 1, i32 4) - %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1) - ret <vscale x 2 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0 } - - -define <vscale x 4 x bfloat> @test_vlseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl) { ; CHECK-LABEL: test_vlseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg8e16.v v7, (a0) +; CHECK-NEXT: vlseg8e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i32 %vl, i32 4) - %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1) - ret <vscale x 4 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0 } - -define <vscale x 4 x bfloat> @test_vlseg8_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) 
@test_vlseg8_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i32 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 4 x i1> %mask, i32 %vl, i32 1, i32 4)
- %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 4 x bfloat> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
diff --git a/llvm/test/CodeGen/RISCV/rvv/vlseg-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vlseg-rv64.ll
index 16e5e7b9..faeabaf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vlseg-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vlseg-rv64.ll
@@ -2,4330 +2,3373 @@
; RUN: llc -mtriple=riscv64 -mattr=+zve64d,+f,+d,+zvfh,+zvfbfmin \
; RUN: -verify-machineinstrs < %s | FileCheck %s
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv1i8_2t(target("riscv.vector.tuple", <vscale x 1 x i8>, 2), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv1i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 2), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i8> @test_vlseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 2) @test_vlseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0)
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv1i8_2t(target("riscv.vector.tuple", <vscale x 1 x i8>, 2) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_2t(target("riscv.vector.tuple", <vscale x 1 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 2) %0
}
-
-define <vscale x 1 x i8> @test_vlseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 2) @test_vlseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv1i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_2t(target("riscv.vector.tuple", <vscale x 1 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 2) %0
}
-
-define <vscale x 1 x i8> @test_vlseg2_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 2) @test_vlseg2_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0)
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv1i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> splat (i1 true), i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_2t(target("riscv.vector.tuple", <vscale x 1 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 2), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i8> @test_vlseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0)
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0
}
-
-define <vscale x 2 x i8> @test_vlseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2), ptr, <vscale x 4 x i1>, i64, i64, i64)
-
-define <vscale x 4 x i8> @test_vlseg2_nxv4i8_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_nxv4i8_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv4i8_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0)
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-define <vscale x 4 x i8> @test_vlseg2_mask_nxv4i8_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_mask_nxv4i8_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv4i8_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2), ptr, <vscale x 8 x i1>, i64, i64, i64)
-
-define <vscale x 8 x i8> @test_vlseg2_nxv8i8_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv8i8_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv8i8_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0)
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-define <vscale x 8 x i8> @test_vlseg2_mask_nxv8i8_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv8i8_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv8i8_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg2e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv16i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2), ptr, <vscale x 16 x i1>, i64, i64, i64)
-
-define <vscale x 16 x i8> @test_vlseg2_nxv16i8_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv16i8_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv16i8_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vlseg2e8.v v6, (a0)
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 16 x i8> @llvm.riscv.tuple.extract.nxv16i8.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 16 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-define <vscale x 16 x i8> @test_vlseg2_mask_nxv16i8_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 16 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv16i8_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 16 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv16i8_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
-; CHECK-NEXT: vlseg2e8.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg2e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv16i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 16 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 16 x i8> @llvm.riscv.tuple.extract.nxv16i8.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 16 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 32 x i8>, 2)
@llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv32i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2), ptr, <vscale x 32 x i1>, i64, i64, i64) - -define <vscale x 32 x i8> @test_vlseg2_nxv32i8_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv32i8_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg2_nxv32i8_triscv.vector.tuple_nxv32i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-NEXT: vlseg2e8.v v4, (a0) +; CHECK-NEXT: vlseg2e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i64 %vl, i64 3) - %1 = call <vscale x 32 x i8> @llvm.riscv.tuple.extract.nxv32i8.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1) - ret <vscale x 32 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0 } - -define <vscale x 32 x i8> @test_vlseg2_mask_nxv32i8_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 32 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv32i8_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 32 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv32i8_triscv.vector.tuple_nxv32i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-NEXT: vlseg2e8.v v4, (a0), v0.t +; CHECK-NEXT: vlseg2e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv32i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 32 x i1> %mask, i64 %vl, i64 1, i64 3) - %1 = call <vscale x 32 x i8> @llvm.riscv.tuple.extract.nxv32i8.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1) - ret <vscale x 32 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0 } - -declare target("riscv.vector.tuple", <vscale x 1 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv1i8_3t(target("riscv.vector.tuple", <vscale x 1 x i8>, 3), ptr, i64, i64) -declare target("riscv.vector.tuple", <vscale x 1 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv1i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 3), ptr, <vscale x 1 x i1>, i64, i64, i64) - -define <vscale x 1 x i8> @test_vlseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 1 x i8>, 3) @test_vlseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vlseg3e8.v v7, (a0) +; CHECK-NEXT: vlseg3e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv1i8_3t(target("riscv.vector.tuple", <vscale x 1 x i8>, 3) undef, ptr %base, i64 %vl, i64 3) - %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_3t(target("riscv.vector.tuple", <vscale x 1 x i8>, 3) %0, i32 1) - ret <vscale x 1 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 1 x i8>, 3) %0 } - -define <vscale x 1 x i8> @test_vlseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> 
%mask) { +define target("riscv.vector.tuple", <vscale x 1 x i8>, 3) @test_vlseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vlseg3e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg3e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv1i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 3) - %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_3t(target("riscv.vector.tuple", <vscale x 1 x i8>, 3) %0, i32 1) - ret <vscale x 1 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 1 x i8>, 3) %0 } - -define <vscale x 1 x i8> @test_vlseg3_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 1 x i8>, 3) @test_vlseg3_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vlseg3e8.v v7, (a0) +; CHECK-NEXT: vlseg3e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv1i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> splat (i1 true), i64 %vl, i64 1, i64 3) - %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_3t(target("riscv.vector.tuple", <vscale x 1 x i8>, 3) %0, i32 1) - ret <vscale x 1 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 1 x i8>, 3) %0 } - -declare target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3), ptr, i64, i64) -declare target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 3), ptr, <vscale x 2 x i1>, i64, i64, i64) - -define <vscale x 2 x i8> @test_vlseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vlseg3e8.v v7, (a0) +; CHECK-NEXT: vlseg3e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, i64 %vl, i64 3) - %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1) - ret <vscale x 2 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0 } - -define <vscale x 2 x i8> @test_vlseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { ; 
CHECK-LABEL: test_vlseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vlseg3e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg3e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 3) - %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1) - ret <vscale x 2 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0 } - -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3), ptr, i64, i64) -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3), ptr, <vscale x 4 x i1>, i64, i64, i64) - -define <vscale x 4 x i8> @test_vlseg3_nxv4i8_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_nxv4i8_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg3_nxv4i8_triscv.vector.tuple_nxv4i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vlseg3e8.v v7, (a0) +; CHECK-NEXT: vlseg3e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, i64 %vl, i64 3) - %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1) - ret <vscale x 4 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0 } - -define <vscale x 4 x i8> @test_vlseg3_mask_nxv4i8_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_mask_nxv4i8_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv4i8_triscv.vector.tuple_nxv4i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vlseg3e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg3e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 3) - %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1) - ret <vscale x 4 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0 } - -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3), ptr, i64, i64) -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3), ptr, <vscale x 8 x i1>, i64, i64, i64) - -define <vscale x 8 x i8> @test_vlseg3_nxv8i8_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 
%vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv8i8_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg3_nxv8i8_triscv.vector.tuple_nxv8i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vlseg3e8.v v7, (a0) +; CHECK-NEXT: vlseg3e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i64 %vl, i64 3) - %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1) - ret <vscale x 8 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0 } - -define <vscale x 8 x i8> @test_vlseg3_mask_nxv8i8_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv8i8_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv8i8_triscv.vector.tuple_nxv8i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vlseg3e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg3e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 3) - %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1) - ret <vscale x 8 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0 } - -declare target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3), ptr, i64, i64) -declare target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv16i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3), ptr, <vscale x 16 x i1>, i64, i64, i64) - -define <vscale x 16 x i8> @test_vlseg3_nxv16i8_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv16i8_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg3_nxv16i8_triscv.vector.tuple_nxv16i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vlseg3e8.v v6, (a0) +; CHECK-NEXT: vlseg3e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i64 %vl, i64 3) - %1 = call <vscale x 16 x i8> @llvm.riscv.tuple.extract.nxv16i8.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1) - ret <vscale x 16 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0 } - -define <vscale x 16 x i8> @test_vlseg3_mask_nxv16i8_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 16 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv16i8_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 16 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv16i8_triscv.vector.tuple_nxv16i8_3t: ; CHECK: # 
%bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vlseg3e8.v v6, (a0), v0.t +; CHECK-NEXT: vlseg3e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv16i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 16 x i1> %mask, i64 %vl, i64 1, i64 3) - %1 = call <vscale x 16 x i8> @llvm.riscv.tuple.extract.nxv16i8.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1) - ret <vscale x 16 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0 } - -declare target("riscv.vector.tuple", <vscale x 1 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv1i8_4t(target("riscv.vector.tuple", <vscale x 1 x i8>, 4), ptr, i64, i64) -declare target("riscv.vector.tuple", <vscale x 1 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv1i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 4), ptr, <vscale x 1 x i1>, i64, i64, i64) - -define <vscale x 1 x i8> @test_vlseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 1 x i8>, 4) @test_vlseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vlseg4e8.v v7, (a0) +; CHECK-NEXT: vlseg4e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv1i8_4t(target("riscv.vector.tuple", <vscale x 1 x i8>, 4) undef, ptr %base, i64 %vl, i64 3) - %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_4t(target("riscv.vector.tuple", <vscale x 1 x i8>, 4) %0, i32 1) - ret <vscale x 1 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 1 x i8>, 4) %0 } - -define <vscale x 1 x i8> @test_vlseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 1 x i8>, 4) @test_vlseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vlseg4e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg4e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv1i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 3) - %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_4t(target("riscv.vector.tuple", <vscale x 1 x i8>, 4) %0, i32 1) - ret <vscale x 1 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 1 x i8>, 4) %0 } - -define <vscale x 1 x i8> @test_vlseg4_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 1 x i8>, 4) @test_vlseg4_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vlseg4e8.v v7, (a0) +; CHECK-NEXT: vlseg4e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 
= tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv1i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> splat (i1 true), i64 %vl, i64 1, i64 3) - %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_4t(target("riscv.vector.tuple", <vscale x 1 x i8>, 4) %0, i32 1) - ret <vscale x 1 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 1 x i8>, 4) %0 } - -declare target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4), ptr, i64, i64) -declare target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 4), ptr, <vscale x 2 x i1>, i64, i64, i64) - -define <vscale x 2 x i8> @test_vlseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vlseg4e8.v v7, (a0) +; CHECK-NEXT: vlseg4e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, i64 %vl, i64 3) - %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1) - ret <vscale x 2 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0 } - -define <vscale x 2 x i8> @test_vlseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vlseg4e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg4e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 3) - %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1) - ret <vscale x 2 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0 } - -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4), ptr, i64, i64) -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4), ptr, <vscale x 4 x i1>, i64, i64, i64) - -define <vscale x 4 x i8> @test_vlseg4_nxv4i8_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_nxv4i8_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg4_nxv4i8_triscv.vector.tuple_nxv4i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vlseg4e8.v v7, (a0) +; CHECK-NEXT: vlseg4e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, i64 %vl, i64 3) - %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1) - ret <vscale x 4 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0 } - -define <vscale x 4 x i8> @test_vlseg4_mask_nxv4i8_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_mask_nxv4i8_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv4i8_triscv.vector.tuple_nxv4i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vlseg4e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg4e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 3) - %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1) - ret <vscale x 4 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0 } - -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4), ptr, i64, i64) -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4), ptr, <vscale x 8 x i1>, i64, i64, i64) - -define <vscale x 8 x i8> @test_vlseg4_nxv8i8_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv8i8_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg4_nxv8i8_triscv.vector.tuple_nxv8i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vlseg4e8.v v7, (a0) +; CHECK-NEXT: vlseg4e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i64 %vl, i64 3) - %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1) - ret <vscale x 8 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0 } - -define <vscale x 8 x i8> @test_vlseg4_mask_nxv8i8_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv8i8_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv8i8_triscv.vector.tuple_nxv8i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vlseg4e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg4e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) 
@llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 3) - %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1) - ret <vscale x 8 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0 } - -declare target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4), ptr, i64, i64) -declare target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv16i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4), ptr, <vscale x 16 x i1>, i64, i64, i64) - -define <vscale x 16 x i8> @test_vlseg4_nxv16i8_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv16i8_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg4_nxv16i8_triscv.vector.tuple_nxv16i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vlseg4e8.v v6, (a0) +; CHECK-NEXT: vlseg4e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i64 %vl, i64 3) - %1 = call <vscale x 16 x i8> @llvm.riscv.tuple.extract.nxv16i8.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1) - ret <vscale x 16 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0 } - -define <vscale x 16 x i8> @test_vlseg4_mask_nxv16i8_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 16 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv16i8_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 16 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv16i8_triscv.vector.tuple_nxv16i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vlseg4e8.v v6, (a0), v0.t +; CHECK-NEXT: vlseg4e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv16i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 16 x i1> %mask, i64 %vl, i64 1, i64 3) - %1 = call <vscale x 16 x i8> @llvm.riscv.tuple.extract.nxv16i8.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1) - ret <vscale x 16 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0 } - -declare target("riscv.vector.tuple", <vscale x 1 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv1i8_5t(target("riscv.vector.tuple", <vscale x 1 x i8>, 5), ptr, i64, i64) -declare target("riscv.vector.tuple", <vscale x 1 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv1i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 5), ptr, <vscale x 1 x i1>, i64, i64, i64) - -define <vscale x 1 x i8> @test_vlseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 1 x i8>, 5) @test_vlseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, 
ma -; CHECK-NEXT: vlseg5e8.v v7, (a0) +; CHECK-NEXT: vlseg5e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv1i8_5t(target("riscv.vector.tuple", <vscale x 1 x i8>, 5) undef, ptr %base, i64 %vl, i64 3) - %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_5t(target("riscv.vector.tuple", <vscale x 1 x i8>, 5) %0, i32 1) - ret <vscale x 1 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 1 x i8>, 5) %0 } - -define <vscale x 1 x i8> @test_vlseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 1 x i8>, 5) @test_vlseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vlseg5e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg5e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv1i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 3) - %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_5t(target("riscv.vector.tuple", <vscale x 1 x i8>, 5) %0, i32 1) - ret <vscale x 1 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 1 x i8>, 5) %0 } - -define <vscale x 1 x i8> @test_vlseg5_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 1 x i8>, 5) @test_vlseg5_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg5_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vlseg5e8.v v7, (a0) +; CHECK-NEXT: vlseg5e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv1i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> splat (i1 true), i64 %vl, i64 1, i64 3) - %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_5t(target("riscv.vector.tuple", <vscale x 1 x i8>, 5) %0, i32 1) - ret <vscale x 1 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 1 x i8>, 5) %0 } - -declare target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5), ptr, i64, i64) -declare target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 5), ptr, <vscale x 2 x i1>, i64, i64, i64) - -define <vscale x 2 x i8> @test_vlseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vlseg5e8.v v7, (a0) +; CHECK-NEXT: vlseg5e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) 
@llvm.riscv.vlseg5.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, i64 %vl, i64 3) - %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1) - ret <vscale x 2 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0 } - -define <vscale x 2 x i8> @test_vlseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vlseg5e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg5e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 3) - %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1) - ret <vscale x 2 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0 } - -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5), ptr, i64, i64) -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5), ptr, <vscale x 4 x i1>, i64, i64, i64) - -define <vscale x 4 x i8> @test_vlseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vlseg5e8.v v7, (a0) +; CHECK-NEXT: vlseg5e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, i64 %vl, i64 3) - %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1) - ret <vscale x 4 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0 } - -define <vscale x 4 x i8> @test_vlseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vlseg5e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg5e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 3) - %1 = call <vscale x 4 x i8> 
@llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1) - ret <vscale x 4 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0 } - -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5), ptr, i64, i64) -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5), ptr, <vscale x 8 x i1>, i64, i64, i64) - -define <vscale x 8 x i8> @test_vlseg5_nxv8i8_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv8i8_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg5_nxv8i8_triscv.vector.tuple_nxv8i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vlseg5e8.v v7, (a0) +; CHECK-NEXT: vlseg5e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i64 %vl, i64 3) - %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1) - ret <vscale x 8 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0 } - -define <vscale x 8 x i8> @test_vlseg5_mask_nxv8i8_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv8i8_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) { ; CHECK-LABEL: test_vlseg5_mask_nxv8i8_triscv.vector.tuple_nxv8i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vlseg5e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg5e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 3) - %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1) - ret <vscale x 8 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0 } - -declare target("riscv.vector.tuple", <vscale x 1 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv1i8_6t(target("riscv.vector.tuple", <vscale x 1 x i8>, 6), ptr, i64, i64) -declare target("riscv.vector.tuple", <vscale x 1 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv1i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 6), ptr, <vscale x 1 x i1>, i64, i64, i64) - -define <vscale x 1 x i8> @test_vlseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 1 x i8>, 6) @test_vlseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vlseg6e8.v v7, (a0) +; CHECK-NEXT: vlseg6e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv1i8_6t(target("riscv.vector.tuple", <vscale x 1 
x i8>, 6) undef, ptr %base, i64 %vl, i64 3) - %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_6t(target("riscv.vector.tuple", <vscale x 1 x i8>, 6) %0, i32 1) - ret <vscale x 1 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 1 x i8>, 6) %0 } - -define <vscale x 1 x i8> @test_vlseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 1 x i8>, 6) @test_vlseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vlseg6e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg6e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv1i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 3) - %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_6t(target("riscv.vector.tuple", <vscale x 1 x i8>, 6) %0, i32 1) - ret <vscale x 1 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 1 x i8>, 6) %0 } - -define <vscale x 1 x i8> @test_vlseg6_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 1 x i8>, 6) @test_vlseg6_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg6_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vlseg6e8.v v7, (a0) +; CHECK-NEXT: vlseg6e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv1i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> splat (i1 true), i64 %vl, i64 1, i64 3) - %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_6t(target("riscv.vector.tuple", <vscale x 1 x i8>, 6) %0, i32 1) - ret <vscale x 1 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 1 x i8>, 6) %0 } - -declare target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6), ptr, i64, i64) -declare target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 6), ptr, <vscale x 2 x i1>, i64, i64, i64) - -define <vscale x 2 x i8> @test_vlseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vlseg6e8.v v7, (a0) +; CHECK-NEXT: vlseg6e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, i64 %vl, i64 3) - %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale 
x 2 x i8>, 6) %0, i32 1) - ret <vscale x 2 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0 } - -define <vscale x 2 x i8> @test_vlseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vlseg6e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg6e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 3) - %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1) - ret <vscale x 2 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0 } - -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6), ptr, i64, i64) -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6), ptr, <vscale x 4 x i1>, i64, i64, i64) - -define <vscale x 4 x i8> @test_vlseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vlseg6e8.v v7, (a0) +; CHECK-NEXT: vlseg6e8.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, i64 %vl, i64 3) - %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1) - ret <vscale x 4 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0 } - -define <vscale x 4 x i8> @test_vlseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vlseg6e8.v v7, (a0), v0.t +; CHECK-NEXT: vlseg6e8.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 3) - %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1) - ret <vscale x 4 x i8> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0 } - -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 6) 
@llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6), ptr, <vscale x 8 x i1>, i64, i64, i64)
-
-define <vscale x 8 x i8> @test_vlseg6_nxv8i8_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv8i8_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg6_nxv8i8_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg6e8.v v7, (a0)
+; CHECK-NEXT: vlseg6e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-define <vscale x 8 x i8> @test_vlseg6_mask_nxv8i8_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv8i8_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv8i8_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg6e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv1i8_7t(target("riscv.vector.tuple", <vscale x 1 x i8>, 7), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv1i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 7), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i8> @test_vlseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 7) @test_vlseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg7e8.v v7, (a0)
+; CHECK-NEXT: vlseg7e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv1i8_7t(target("riscv.vector.tuple", <vscale x 1 x i8>, 7) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_7t(target("riscv.vector.tuple", <vscale x 1 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 7) %0
}
-
-define <vscale x 1 x i8> @test_vlseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 7) @test_vlseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg7e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv1i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_7t(target("riscv.vector.tuple", <vscale x 1 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 7) %0
}
-
-define <vscale x 1 x i8> @test_vlseg7_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 7) @test_vlseg7_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg7e8.v v7, (a0)
+; CHECK-NEXT: vlseg7e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv1i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> splat (i1 true), i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_7t(target("riscv.vector.tuple", <vscale x 1 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 7), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i8> @test_vlseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg7e8.v v7, (a0)
+; CHECK-NEXT: vlseg7e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0
}
-
-define <vscale x 2 x i8> @test_vlseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg7e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7), ptr, <vscale x 4 x i1>, i64, i64, i64)
-
-define <vscale x 4 x i8> @test_vlseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg7e8.v v7, (a0)
+; CHECK-NEXT: vlseg7e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-define <vscale x 4 x i8> @test_vlseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg7e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7), ptr, <vscale x 8 x i1>, i64, i64, i64)
-
-define <vscale x 8 x i8> @test_vlseg7_nxv8i8_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv8i8_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv8i8_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg7e8.v v7, (a0)
+; CHECK-NEXT: vlseg7e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-define <vscale x 8 x i8> @test_vlseg7_mask_nxv8i8_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv8i8_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv8i8_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg7e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv1i8_8t(target("riscv.vector.tuple", <vscale x 1 x i8>, 8), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 1 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv1i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 8), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i8> @test_vlseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 8) @test_vlseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg8e8.v v7, (a0)
+; CHECK-NEXT: vlseg8e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv1i8_8t(target("riscv.vector.tuple", <vscale x 1 x i8>, 8) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_8t(target("riscv.vector.tuple", <vscale x 1 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 8) %0
}
-
-define <vscale x 1 x i8> @test_vlseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 8) @test_vlseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg8e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv1i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_8t(target("riscv.vector.tuple", <vscale x 1 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 8) %0
}
-
-define <vscale x 1 x i8> @test_vlseg8_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 1 x i8>, 8) @test_vlseg8_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
-; CHECK-NEXT: vlseg8e8.v v7, (a0)
+; CHECK-NEXT: vlseg8e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 1 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv1i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 1 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> splat (i1 true), i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 1 x i8> @llvm.riscv.tuple.extract.nxv1i8.triscv.vector.tuple_nxv1i8_8t(target("riscv.vector.tuple", <vscale x 1 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 1 x i8>, 8) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 8), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i8> @test_vlseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg8e8.v v7, (a0)
+; CHECK-NEXT: vlseg8e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0
}
-
-define <vscale x 2 x i8> @test_vlseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
-; CHECK-NEXT: vlseg8e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 2 x i8> @llvm.riscv.tuple.extract.nxv2i8.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8), ptr, <vscale x 4 x i1>, i64, i64, i64)
-
-define <vscale x 4 x i8> @test_vlseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg8e8.v v7, (a0)
+; CHECK-NEXT: vlseg8e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-define <vscale x 4 x i8> @test_vlseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; CHECK-NEXT: vlseg8e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv4i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 4 x i8> @llvm.riscv.tuple.extract.nxv4i8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 4 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8), ptr, i64, i64)
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8), ptr, <vscale x 8 x i1>, i64, i64, i64)
-
-define <vscale x 8 x i8> @test_vlseg8_nxv8i8_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv8i8_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg8_nxv8i8_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg8e8.v v7, (a0)
+; CHECK-NEXT: vlseg8e8.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i64 %vl, i64 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-define <vscale x 8 x i8> @test_vlseg8_mask_nxv8i8_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv8i8_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv8i8_triscv.vector.tuple_nxv8i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT: vlseg8e8.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e8.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv8i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 3)
- %1 = call <vscale x 8 x i8> @llvm.riscv.tuple.extract.nxv8i8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
- ret <vscale x 8 x i8> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 2), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i16> @test_vlseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0
}
-
-define <vscale x 1 x i16> @test_vlseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i16> @test_vlseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-define <vscale x 2 x i16> @test_vlseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2), ptr, <vscale x 4 x i1>, i64, i64, i64)
-
-define <vscale x 4 x i16> @test_vlseg2_nxv4i16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv4i16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv4i16_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-define <vscale x 4 x i16> @test_vlseg2_mask_nxv4i16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv4i16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv4i16_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2), ptr, <vscale x 8 x i1>, i64, i64, i64)
-
-define <vscale x 8 x i16> @test_vlseg2_nxv8i16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv8i16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv8i16_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v6, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 8 x i16> @llvm.riscv.tuple.extract.nxv8i16.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-define <vscale x 8 x i16> @test_vlseg2_mask_nxv8i16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv8i16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv8i16_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 8 x i16> @llvm.riscv.tuple.extract.nxv8i16.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2), ptr, <vscale x 16 x i1>, i64, i64, i64)
-
-define <vscale x 16 x i16> @test_vlseg2_nxv16i16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv16i16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv16i16_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v4, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 16 x i16> @llvm.riscv.tuple.extract.nxv16i16.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 16 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-define <vscale x 16 x i16> @test_vlseg2_mask_nxv16i16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 16 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv16i16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 16 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv16i16_triscv.vector.tuple_nxv32i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v4, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 16 x i16> @llvm.riscv.tuple.extract.nxv16i16.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1)
- ret <vscale x 16 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 3), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i16> @test_vlseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0
}
-
-define <vscale x 1 x i16> @test_vlseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i16> @test_vlseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-define <vscale x 2 x i16> @test_vlseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3), ptr, <vscale x 4 x i1>, i64, i64, i64)
-
-define <vscale x 4 x i16> @test_vlseg3_nxv4i16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv4i16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv4i16_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-define <vscale x 4 x i16> @test_vlseg3_mask_nxv4i16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv4i16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv4i16_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3), ptr, <vscale x 8 x i1>, i64, i64, i64)
-
-define <vscale x 8 x i16> @test_vlseg3_nxv8i16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv8i16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv8i16_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v6, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 8 x i16> @llvm.riscv.tuple.extract.nxv8i16.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 8 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-define <vscale x 8 x i16> @test_vlseg3_mask_nxv8i16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv8i16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv8i16_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 8 x i16> @llvm.riscv.tuple.extract.nxv8i16.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 8 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 4), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i16> @test_vlseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0
}
-
-define <vscale x 1 x i16> @test_vlseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i16> @test_vlseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-define <vscale x 2 x i16> @test_vlseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4), ptr, <vscale x 4 x i1>, i64, i64, i64)
-
-define <vscale x 4 x i16> @test_vlseg4_nxv4i16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv4i16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv4i16_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-define <vscale x 4 x i16> @test_vlseg4_mask_nxv4i16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv4i16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv4i16_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4), ptr, <vscale x 8 x i1>, i64, i64, i64)
-
-define <vscale x 8 x i16> @test_vlseg4_nxv8i16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv8i16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv8i16_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v6, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 8 x i16> @llvm.riscv.tuple.extract.nxv8i16.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 8 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-define <vscale x 8 x i16> @test_vlseg4_mask_nxv8i16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv8i16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv8i16_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 8 x i16> @llvm.riscv.tuple.extract.nxv8i16.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 8 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 5), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i16> @test_vlseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0)
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0
}
-
-define <vscale x 1 x i16> @test_vlseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i16> @test_vlseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0)
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-define <vscale x 2 x i16> @test_vlseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5), ptr, <vscale x 4 x i1>, i64, i64, i64)
-
-define <vscale x 4 x i16> @test_vlseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0)
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-define <vscale x 4 x i16> @test_vlseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 6), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i16> @test_vlseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0)
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0
}
-
-define <vscale x 1 x i16> @test_vlseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i16> @test_vlseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0)
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-define <vscale x 2 x i16> @test_vlseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6), ptr, <vscale x 4 x i1>, i64, i64, i64)
-
-define <vscale x 4 x i16> @test_vlseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0)
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-define <vscale x 4 x i16> @test_vlseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 7), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i16> @test_vlseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0)
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0
}
-
-define <vscale x 1 x i16> @test_vlseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i16> @test_vlseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0)
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-define <vscale x 2 x i16> @test_vlseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7), ptr, <vscale x 4 x i1>, i64, i64, i64)
-
-define <vscale x 4 x i16> @test_vlseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0)
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-define <vscale x 4 x i16> @test_vlseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 4 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 8), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i16> @test_vlseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0)
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0
}
-
-define <vscale x 1 x i16> @test_vlseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x i16> @llvm.riscv.tuple.extract.nxv1i16.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1)
- ret <vscale x 1 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i16> @test_vlseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg8e16.v v7, (a0)
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
- ret <vscale x 2 x i16> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
}
-
-define <vscale x 2 x i16>
@test_vlseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 2 x i16> @llvm.riscv.tuple.extract.nxv2i16.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1) - ret <vscale x 2 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0 } - -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8), ptr, <vscale x 4 x i1>, i64, i64, i64) - -define <vscale x 4 x i16> @test_vlseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg8e16.v v7, (a0) +; CHECK-NEXT: vlseg8e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1) - ret <vscale x 4 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0 } - -define <vscale x 4 x i16> @test_vlseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 4 x i16> @llvm.riscv.tuple.extract.nxv4i16.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1) - ret <vscale x 4 x i16> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0 } - -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2), ptr, <vscale x 1 x i1>, i64, i64, i64) - -define <vscale x 1 x i32> @test_vlseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x 
i8>, 2) @test_vlseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg2e32.v v7, (a0) +; CHECK-NEXT: vlseg2e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, i64 %vl, i64 5) - %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1) - ret <vscale x 1 x i32> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0 } - -define <vscale x 1 x i32> @test_vlseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg2e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 5) - %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1) - ret <vscale x 1 x i32> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0 } - -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2), ptr, <vscale x 2 x i1>, i64, i64, i64) - -define <vscale x 2 x i32> @test_vlseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg2e32.v v7, (a0) +; CHECK-NEXT: vlseg2e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i64 %vl, i64 5) - %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1) - ret <vscale x 2 x i32> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0 } - -define <vscale x 2 x i32> @test_vlseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg2e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 
2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 5) - %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1) - ret <vscale x 2 x i32> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0 } - -declare target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2), ptr, <vscale x 4 x i1>, i64, i64, i64) - -define <vscale x 4 x i32> @test_vlseg2_nxv4i32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv4i32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg2_nxv4i32_triscv.vector.tuple_nxv16i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vlseg2e32.v v6, (a0) +; CHECK-NEXT: vlseg2e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i64 %vl, i64 5) - %1 = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1) - ret <vscale x 4 x i32> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0 } - -define <vscale x 4 x i32> @test_vlseg2_mask_nxv4i32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv4i32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv4i32_triscv.vector.tuple_nxv16i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vlseg2e32.v v6, (a0), v0.t +; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5) - %1 = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1) - ret <vscale x 4 x i32> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0 } - -declare target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2), ptr, <vscale x 8 x i1>, i64, i64, i64) - -define <vscale x 8 x i32> @test_vlseg2_nxv8i32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv8i32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg2_nxv8i32_triscv.vector.tuple_nxv32i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; CHECK-NEXT: vlseg2e32.v v4, (a0) +; CHECK-NEXT: vlseg2e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i64 %vl, i64 5) - %1 = call <vscale x 8 x i32> 
@llvm.riscv.tuple.extract.nxv8i32.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1) - ret <vscale x 8 x i32> %1 + ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0 } - -define <vscale x 8 x i32> @test_vlseg2_mask_nxv8i32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv8i32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv8i32_triscv.vector.tuple_nxv32i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; CHECK-NEXT: vlseg2e32.v v4, (a0), v0.t +; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 5) - %1 = call <vscale x 8 x i32> @llvm.riscv.tuple.extract.nxv8i32.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1) - ret <vscale x 8 x i32> %1 + ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0 } - -declare target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3), ptr, <vscale x 1 x i1>, i64, i64, i64) - -define <vscale x 1 x i32> @test_vlseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg3e32.v v7, (a0) +; CHECK-NEXT: vlseg3e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, i64 %vl, i64 5) - %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1) - ret <vscale x 1 x i32> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0 } - -define <vscale x 1 x i32> @test_vlseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg3e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 5) - %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1) - ret <vscale x 1 x i32> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0 } - -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 3) 
@llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3), ptr, <vscale x 2 x i1>, i64, i64, i64) - -define <vscale x 2 x i32> @test_vlseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg3e32.v v7, (a0) +; CHECK-NEXT: vlseg3e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i64 %vl, i64 5) - %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1) - ret <vscale x 2 x i32> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0 } - -define <vscale x 2 x i32> @test_vlseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg3e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 5) - %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1) - ret <vscale x 2 x i32> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0 } - -declare target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3), ptr, <vscale x 4 x i1>, i64, i64, i64) - -define <vscale x 4 x i32> @test_vlseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vlseg3e32.v v6, (a0) +; CHECK-NEXT: vlseg3e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i64 %vl, i64 5) - %1 = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1) - ret <vscale x 4 x i32> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0 } - -define <vscale x 4 x i32> @test_vlseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { 
 ; CHECK-LABEL: test_vlseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i8_3t:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vlseg3e32.v v6, (a0), v0.t
+; CHECK-NEXT:    vlseg3e32.v v8, (a0), v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
-  %1 = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
-  ret <vscale x 4 x i32> %1
+  ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
 }
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i32> @test_vlseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl) {
 ; CHECK-LABEL: test_vlseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT:    vlseg4e32.v v7, (a0)
+; CHECK-NEXT:    vlseg4e32.v v8, (a0)
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, i64 %vl, i64 5)
-  %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
-  ret <vscale x 1 x i32> %1
+  ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
 }
-
-define <vscale x 1 x i32> @test_vlseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
 ; CHECK-LABEL: test_vlseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8_4t:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT:    vlseg4e32.v v7, (a0), v0.t
+; CHECK-NEXT:    vlseg4e32.v v8, (a0), v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 5)
-  %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
-  ret <vscale x 1 x i32> %1
+  ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
 }
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i32> @test_vlseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) {
 ; CHECK-LABEL: test_vlseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vlseg4e32.v v7, (a0)
+; CHECK-NEXT:    vlseg4e32.v v8, (a0)
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i64 %vl, i64 5)
-  %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
-  ret <vscale x 2 x i32> %1
+  ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
 }
-
-define <vscale x 2 x i32> @test_vlseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: test_vlseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8_4t:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vlseg4e32.v v7, (a0), v0.t
+; CHECK-NEXT:    vlseg4e32.v v8, (a0), v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 5)
-  %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
-  ret <vscale x 2 x i32> %1
+  ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
 }
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4), ptr, <vscale x 4 x i1>, i64, i64, i64)
-
-define <vscale x 4 x i32> @test_vlseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) {
 ; CHECK-LABEL: test_vlseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vlseg4e32.v v6, (a0)
+; CHECK-NEXT:    vlseg4e32.v v8, (a0)
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i64 %vl, i64 5)
-  %1 = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
-  ret <vscale x 4 x i32> %1
+  ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
 }
-
-define <vscale x 4 x i32> @test_vlseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
 ; CHECK-LABEL: test_vlseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i8_4t:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vlseg4e32.v v6, (a0), v0.t
+; CHECK-NEXT:    vlseg4e32.v v8, (a0), v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5)
-  %1 = call <vscale x 4 x i32> @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
-  ret <vscale x 4 x i32> %1
+  ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
 }
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i32> @test_vlseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl) {
 ; CHECK-LABEL: test_vlseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT:    vlseg5e32.v v7, (a0)
+; CHECK-NEXT:    vlseg5e32.v v8, (a0)
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, i64 %vl, i64 5)
-  %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
-  ret <vscale x 1 x i32> %1
+  ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
 }
-
-define <vscale x 1 x i32> @test_vlseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
 ; CHECK-LABEL: test_vlseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8_5t:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT:    vlseg5e32.v v7, (a0), v0.t
+; CHECK-NEXT:    vlseg5e32.v v8, (a0), v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 5)
-  %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
-  ret <vscale x 1 x i32> %1
+  ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
 }
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i32> @test_vlseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) {
 ; CHECK-LABEL: test_vlseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vlseg5e32.v v7, (a0)
+; CHECK-NEXT:    vlseg5e32.v v8, (a0)
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i64 %vl, i64 5)
-  %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
-  ret <vscale x 2 x i32> %1
+  ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
 }
-
-define <vscale x 2 x i32> @test_vlseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: test_vlseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8_5t:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vlseg5e32.v v7, (a0), v0.t
+; CHECK-NEXT:    vlseg5e32.v v8, (a0), v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 5)
-  %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
-  ret <vscale x 2 x i32> %1
+  ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
 }
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i32> @test_vlseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl) {
 ; CHECK-LABEL: test_vlseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT:    vlseg6e32.v v7, (a0)
+; CHECK-NEXT:    vlseg6e32.v v8, (a0)
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, i64 %vl, i64 5)
-  %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
-  ret <vscale x 1 x i32> %1
+  ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
 }
-
-define <vscale x 1 x i32> @test_vlseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
 ; CHECK-LABEL: test_vlseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8_6t:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT:    vlseg6e32.v v7, (a0), v0.t
+; CHECK-NEXT:    vlseg6e32.v v8, (a0), v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 5)
-  %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
-  ret <vscale x 1 x i32> %1
+  ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
 }
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i32> @test_vlseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) {
 ; CHECK-LABEL: test_vlseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vlseg6e32.v v7, (a0)
+; CHECK-NEXT:    vlseg6e32.v v8, (a0)
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i64 %vl, i64 5)
-  %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
-  ret <vscale x 2 x i32> %1
+  ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
 }
-
-define <vscale x 2 x i32> @test_vlseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: test_vlseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8_6t:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vlseg6e32.v v7, (a0), v0.t
+; CHECK-NEXT:    vlseg6e32.v v8, (a0), v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 5)
-  %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
-  ret <vscale x 2 x i32> %1
+  ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
 }
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i32> @test_vlseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl) {
 ; CHECK-LABEL: test_vlseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT:    vlseg7e32.v v7, (a0)
+; CHECK-NEXT:    vlseg7e32.v v8, (a0)
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, i64 %vl, i64 5)
-  %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
-  ret <vscale x 1 x i32> %1
+  ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
 }
-
-define <vscale x 1 x i32> @test_vlseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
 ; CHECK-LABEL: test_vlseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8_7t:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT:    vlseg7e32.v v7, (a0), v0.t
+; CHECK-NEXT:    vlseg7e32.v v8, (a0), v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 5)
-  %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
-  ret <vscale x 1 x i32> %1
+  ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
 }
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i32> @test_vlseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) {
 ; CHECK-LABEL: test_vlseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vlseg7e32.v v7, (a0)
+; CHECK-NEXT:    vlseg7e32.v v8, (a0)
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i64 %vl, i64 5)
-  %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
-  ret <vscale x 2 x i32> %1
+  ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
 }
-
-define <vscale x 2 x i32> @test_vlseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: test_vlseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8_7t:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vlseg7e32.v v7, (a0), v0.t
+; CHECK-NEXT:    vlseg7e32.v v8, (a0), v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 5)
-  %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
-  ret <vscale x 2 x i32> %1
+  ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
 }
-
-declare target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i32> @test_vlseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl) {
 ; CHECK-LABEL: test_vlseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT:    vlseg8e32.v v7, (a0)
+; CHECK-NEXT:    vlseg8e32.v v8, (a0)
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, i64 %vl, i64 5)
-  %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
-  ret <vscale x 1 x i32> %1
+  ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
 }
-
-define <vscale x 1 x i32> @test_vlseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
 ; CHECK-LABEL: test_vlseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8_8t:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
-; CHECK-NEXT:    vlseg8e32.v v7, (a0), v0.t
+; CHECK-NEXT:    vlseg8e32.v v8, (a0), v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 5)
-  %1 = call <vscale x 1 x i32> @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1)
-  ret <vscale x 1 x i32> %1
+  ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0
 }
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i32> @test_vlseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) {
 ; CHECK-LABEL: test_vlseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vlseg8e32.v v7, (a0)
+; CHECK-NEXT:    vlseg8e32.v v8, (a0)
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i64 %vl, i64 5)
-  %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
-  ret <vscale x 2 x i32> %1
+  ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
 }
-
-define <vscale x 2 x i32> @test_vlseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: test_vlseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8_8t:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
-; CHECK-NEXT:    vlseg8e32.v v7, (a0), v0.t
+; CHECK-NEXT:    vlseg8e32.v v8, (a0), v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 5)
-  %1 = call <vscale x 2 x i32> @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1)
-  ret <vscale x 2 x i32> %1
+  ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0
 }
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i64> @test_vlseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) {
 ; CHECK-LABEL: test_vlseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT:    vlseg2e64.v v7, (a0)
+; CHECK-NEXT:    vlseg2e64.v v8, (a0)
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i64 %vl, i64 6)
-  %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
-  ret <vscale x 1 x i64> %1
+  ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
 }
-
-define <vscale x 1 x i64> @test_vlseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
 ; CHECK-LABEL: test_vlseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8_2t:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT:    vlseg2e64.v v7, (a0), v0.t
+; CHECK-NEXT:    vlseg2e64.v v8, (a0), v0.t
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 6)
-  %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
-  ret <vscale x 1 x i64> %1
+  ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
 }
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i64> @test_vlseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) {
 ; CHECK-LABEL: test_vlseg2_nxv2i64_triscv.vector.tuple_nxv16i8_2t:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT:    vlseg2e64.v v6, (a0)
+; CHECK-NEXT:    vlseg2e64.v v8, (a0)
 ; CHECK-NEXT:    ret
 entry:
   %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i64 %vl, i64 6)
-  %1 = call <vscale x 2 x i64> @llvm.riscv.tuple.extract.nxv2i64.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
-  ret <vscale x 2 x i64> %1
+  ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
 }
-
-define <vscale x 2 x i64> @test_vlseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv2i64_triscv.vector.tuple_nxv16i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vlseg2e64.v v6, (a0), v0.t +; CHECK-NEXT: vlseg2e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6) - %1 = call <vscale x 2 x i64> @llvm.riscv.tuple.extract.nxv2i64.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1) - ret <vscale x 2 x i64> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0 } - -declare target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2), ptr, <vscale x 4 x i1>, i64, i64, i64) - -define <vscale x 4 x i64> @test_vlseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg2_nxv4i64_triscv.vector.tuple_nxv32i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; CHECK-NEXT: vlseg2e64.v v4, (a0) +; CHECK-NEXT: vlseg2e64.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i64 %vl, i64 6) - %1 = call <vscale x 4 x i64> @llvm.riscv.tuple.extract.nxv4i64.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1) - ret <vscale x 4 x i64> %1 + ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0 } - -define <vscale x 4 x i64> @test_vlseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv4i64_triscv.vector.tuple_nxv32i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; CHECK-NEXT: vlseg2e64.v v4, (a0), v0.t +; CHECK-NEXT: vlseg2e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 6) - %1 = call <vscale x 4 x i64> @llvm.riscv.tuple.extract.nxv4i64.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1) - ret <vscale x 4 x i64> %1 + ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0 } - -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3), ptr, <vscale x 1 x i1>, i64, i64, i64) - -define <vscale x 1 x i64> @test_vlseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: 
vlseg3e64.v v7, (a0) +; CHECK-NEXT: vlseg3e64.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i64 %vl, i64 6) - %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1) - ret <vscale x 1 x i64> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0 } - -define <vscale x 1 x i64> @test_vlseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg3e64.v v7, (a0), v0.t +; CHECK-NEXT: vlseg3e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 6) - %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1) - ret <vscale x 1 x i64> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0 } - -declare target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3), ptr, <vscale x 2 x i1>, i64, i64, i64) - -define <vscale x 2 x i64> @test_vlseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vlseg3e64.v v6, (a0) +; CHECK-NEXT: vlseg3e64.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i64 %vl, i64 6) - %1 = call <vscale x 2 x i64> @llvm.riscv.tuple.extract.nxv2i64.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1) - ret <vscale x 2 x i64> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0 } - -define <vscale x 2 x i64> @test_vlseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vlseg3e64.v v6, (a0), v0.t +; CHECK-NEXT: vlseg3e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6) - %1 = call <vscale x 2 x i64> 
- %1 = call <vscale x 2 x i64> @llvm.riscv.tuple.extract.nxv2i64.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i64> @test_vlseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg4e64.v v7, (a0)
+; CHECK-NEXT: vlseg4e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i64 %vl, i64 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-define <vscale x 1 x i64> @test_vlseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg4e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4), ptr, <vscale x 2 x i1>, i64, i64, i64)
-
-define <vscale x 2 x i64> @test_vlseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg4e64.v v6, (a0)
+; CHECK-NEXT: vlseg4e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i64 %vl, i64 6)
- %1 = call <vscale x 2 x i64> @llvm.riscv.tuple.extract.nxv2i64.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-define <vscale x 2 x i64> @test_vlseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma
-; CHECK-NEXT: vlseg4e64.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg4e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6)
- %1 = call <vscale x 2 x i64> @llvm.riscv.tuple.extract.nxv2i64.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i64> @test_vlseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg5e64.v v7, (a0)
+; CHECK-NEXT: vlseg5e64.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i64 %vl, i64 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-define <vscale x 1 x i64> @test_vlseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
-; CHECK-NEXT: vlseg5e64.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e64.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 6)
- %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x i64> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-declare target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6), ptr, <vscale x 1 x i1>, i64, i64, i64)
-
-define <vscale x 1 x i64> @test_vlseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) {
target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg6e64.v v7, (a0) +; CHECK-NEXT: vlseg6e64.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i64 %vl, i64 6) - %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1) - ret <vscale x 1 x i64> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0 } - -define <vscale x 1 x i64> @test_vlseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg6e64.v v7, (a0), v0.t +; CHECK-NEXT: vlseg6e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 6) - %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1) - ret <vscale x 1 x i64> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0 } - -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7), ptr, <vscale x 1 x i1>, i64, i64, i64) - -define <vscale x 1 x i64> @test_vlseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg7e64.v v7, (a0) +; CHECK-NEXT: vlseg7e64.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i64 %vl, i64 6) - %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1) - ret <vscale x 1 x i64> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0 } - -define <vscale x 1 x i64> @test_vlseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg7e64.v v7, (a0), v0.t +; CHECK-NEXT: vlseg7e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call 
target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 6) - %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1) - ret <vscale x 1 x i64> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0 } - -declare target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8), ptr, <vscale x 1 x i1>, i64, i64, i64) - -define <vscale x 1 x i64> @test_vlseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg8e64.v v7, (a0) +; CHECK-NEXT: vlseg8e64.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i64 %vl, i64 6) - %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1) - ret <vscale x 1 x i64> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0 } - -define <vscale x 1 x i64> @test_vlseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg8e64.v v7, (a0), v0.t +; CHECK-NEXT: vlseg8e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 6) - %1 = call <vscale x 1 x i64> @llvm.riscv.tuple.extract.nxv1i64.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1) - ret <vscale x 1 x i64> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0 } - - -define <vscale x 1 x half> @test_vlseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg2e16.v v7, (a0) +; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1) - ret <vscale x 1 x half> %1 + ret target("riscv.vector.tuple", <vscale 
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0
}
-
-define <vscale x 1 x half> @test_vlseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0
}
-
-
-define <vscale x 2 x half> @test_vlseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-define <vscale x 2 x half> @test_vlseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0
}
-
-
-define <vscale x 4 x half> @test_vlseg2_nxv4f16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv4f16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv4f16_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-define <vscale x 4 x half> @test_vlseg2_mask_nxv4f16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv4f16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv4f16_triscv.vector.tuple_nxv8i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0
}
-
-
-define <vscale x 8 x half> @test_vlseg2_nxv8f16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv8f16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg2_nxv8f16_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v6, (a0)
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 8 x half> @llvm.riscv.tuple.extract.nxv8f16.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-define <vscale x 8 x half> @test_vlseg2_mask_nxv8f16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv8f16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg2_mask_nxv8f16_triscv.vector.tuple_nxv16i8_2t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg2e16.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 8 x half> @llvm.riscv.tuple.extract.nxv8f16.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1)
- ret <vscale x 8 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0
}
-
-
-define <vscale x 16 x half> @test_vlseg2_nxv16f16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) {
target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv16f16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg2_nxv16f16_triscv.vector.tuple_nxv32i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vlseg2e16.v v4, (a0) +; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 16 x half> @llvm.riscv.tuple.extract.nxv16f16.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1) - ret <vscale x 16 x half> %1 + ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0 } - -define <vscale x 16 x half> @test_vlseg2_mask_nxv16f16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 16 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv16f16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 16 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv16f16_triscv.vector.tuple_nxv32i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vlseg2e16.v v4, (a0), v0.t +; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 16 x half> @llvm.riscv.tuple.extract.nxv16f16.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1) - ret <vscale x 16 x half> %1 + ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0 } - - -define <vscale x 1 x half> @test_vlseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg3e16.v v7, (a0) +; CHECK-NEXT: vlseg3e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1) - ret <vscale x 1 x half> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0 } - -define <vscale x 1 x half> @test_vlseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr 
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0
}
-
-
-define <vscale x 2 x half> @test_vlseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-define <vscale x 2 x half> @test_vlseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0
}
-
-
-define <vscale x 4 x half> @test_vlseg3_nxv4f16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv4f16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv4f16_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-define <vscale x 4 x half> @test_vlseg3_mask_nxv4f16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv4f16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv4f16_triscv.vector.tuple_nxv8i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0
}
-
-
-define <vscale x 8 x half> @test_vlseg3_nxv8f16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv8f16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg3_nxv8f16_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v6, (a0)
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 8 x half> @llvm.riscv.tuple.extract.nxv8f16.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 8 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-define <vscale x 8 x half> @test_vlseg3_mask_nxv8f16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv8f16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg3_mask_nxv8f16_triscv.vector.tuple_nxv16i8_3t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg3e16.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 8 x half> @llvm.riscv.tuple.extract.nxv8f16.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1)
- ret <vscale x 8 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0
}
-
-
-define <vscale x 1 x half> @test_vlseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0
}
-
-define <vscale x 1 x half> @test_vlseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0
}
-
-
-define <vscale x 2 x half> @test_vlseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-define <vscale x 2 x half> @test_vlseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0
}
-
-
-define <vscale x 4 x half> @test_vlseg4_nxv4f16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv4f16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv4f16_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-define <vscale x 4 x half> @test_vlseg4_mask_nxv4f16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv4f16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv4f16_triscv.vector.tuple_nxv8i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0
}
-
-
-define <vscale x 8 x half> @test_vlseg4_nxv8f16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv8f16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg4_nxv8f16_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v6, (a0)
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 8 x half> @llvm.riscv.tuple.extract.nxv8f16.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 8 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-define <vscale x 8 x half> @test_vlseg4_mask_nxv8f16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv8f16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: test_vlseg4_mask_nxv8f16_triscv.vector.tuple_nxv16i8_4t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
-; CHECK-NEXT: vlseg4e16.v v6, (a0), v0.t
+; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 8 x half> @llvm.riscv.tuple.extract.nxv8f16.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1)
- ret <vscale x 8 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0
}
-
-
-define <vscale x 1 x half> @test_vlseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0)
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0
}
-
-define <vscale x 1 x half> @test_vlseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0
}
-
-
-define <vscale x 2 x half> @test_vlseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0)
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-define <vscale x 2 x half> @test_vlseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0
}
-
-
-define <vscale x 4 x half> @test_vlseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0)
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-define <vscale x 4 x half> @test_vlseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i8_5t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0
}
-
-
-define <vscale x 1 x half> @test_vlseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0)
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0
}
-
-define <vscale x 1 x half> @test_vlseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 6)
@test_vlseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0
}
-
-
-define <vscale x 2 x half> @test_vlseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0)
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-define <vscale x 2 x half> @test_vlseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0
}
-
-
-define <vscale x 4 x half> @test_vlseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0)
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x half>
@llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-define <vscale x 4 x half> @test_vlseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i8_6t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0
}
-
-
-define <vscale x 1 x half> @test_vlseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0)
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0
}
-
-define <vscale x 1 x half> @test_vlseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1)
- ret <vscale x 1 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0
}
-
-
-define <vscale x 2 x half> @test_vlseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0)
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-define <vscale x 2 x half> @test_vlseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1)
- ret <vscale x 2 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0
}
-
-
-define <vscale x 4 x half> @test_vlseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) {
; CHECK-LABEL: test_vlseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0)
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i64 %vl, i64 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 4 x half> %1
+ ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0
}
-
-define <vscale x 4 x half> @test_vlseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
+define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: test_vlseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i8_7t:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t
+; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t
; CHECK-NEXT: ret
entry:
%0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4)
- %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1)
- ret <vscale x 4 x half> %1
ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0 } - - -define <vscale x 1 x half> @test_vlseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg8e16.v v7, (a0) +; CHECK-NEXT: vlseg8e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1) - ret <vscale x 1 x half> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0 } - -define <vscale x 1 x half> @test_vlseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 1 x half> @llvm.riscv.tuple.extract.nxv1f16.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1) - ret <vscale x 1 x half> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0 } - - -define <vscale x 2 x half> @test_vlseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg8e16.v v7, (a0) +; CHECK-NEXT: vlseg8e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1) - ret <vscale x 2 x half> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0 } - -define <vscale x 2 x half> @test_vlseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", 
<vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 2 x half> @llvm.riscv.tuple.extract.nxv2f16.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1) - ret <vscale x 2 x half> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0 } - - -define <vscale x 4 x half> @test_vlseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg8e16.v v7, (a0) +; CHECK-NEXT: vlseg8e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1) - ret <vscale x 4 x half> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0 } - -define <vscale x 4 x half> @test_vlseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 4 x half> @llvm.riscv.tuple.extract.nxv4f16.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1) - ret <vscale x 4 x half> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0 } - - -define <vscale x 1 x float> @test_vlseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg2e32.v v7, (a0) +; CHECK-NEXT: vlseg2e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, i64 %vl, i64 5) - %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1) - ret <vscale x 1 x float> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0 } - -define <vscale x 1 x float> @test_vlseg2_mask_nxv1f32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) 
@test_vlseg2_mask_nxv1f32_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv1f32_triscv.vector.tuple_nxv4i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg2e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 5) - %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1) - ret <vscale x 1 x float> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0 } - - -define <vscale x 2 x float> @test_vlseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg2e32.v v7, (a0) +; CHECK-NEXT: vlseg2e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i64 %vl, i64 5) - %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1) - ret <vscale x 2 x float> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0 } - -define <vscale x 2 x float> @test_vlseg2_mask_nxv2f32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv2f32_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv2f32_triscv.vector.tuple_nxv8i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg2e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 5) - %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1) - ret <vscale x 2 x float> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0 } - - -define <vscale x 4 x float> @test_vlseg2_nxv4f32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv4f32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg2_nxv4f32_triscv.vector.tuple_nxv16i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vlseg2e32.v v6, (a0) +; CHECK-NEXT: vlseg2e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i64 %vl, i64 5) - %1 = call <vscale x 4 x float> 
@llvm.riscv.tuple.extract.nxv4f32.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1) - ret <vscale x 4 x float> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0 } - -define <vscale x 4 x float> @test_vlseg2_mask_nxv4f32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv4f32_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv4f32_triscv.vector.tuple_nxv16i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vlseg2e32.v v6, (a0), v0.t +; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5) - %1 = call <vscale x 4 x float> @llvm.riscv.tuple.extract.nxv4f32.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1) - ret <vscale x 4 x float> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0 } - - -define <vscale x 8 x float> @test_vlseg2_nxv8f32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv8f32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg2_nxv8f32_triscv.vector.tuple_nxv32i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; CHECK-NEXT: vlseg2e32.v v4, (a0) +; CHECK-NEXT: vlseg2e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i64 %vl, i64 5) - %1 = call <vscale x 8 x float> @llvm.riscv.tuple.extract.nxv8f32.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1) - ret <vscale x 8 x float> %1 + ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0 } - -define <vscale x 8 x float> @test_vlseg2_mask_nxv8f32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv8f32_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv8f32_triscv.vector.tuple_nxv32i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; CHECK-NEXT: vlseg2e32.v v4, (a0), v0.t +; CHECK-NEXT: vlseg2e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 5) - %1 = call <vscale x 8 x float> @llvm.riscv.tuple.extract.nxv8f32.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1) - ret <vscale x 8 x float> %1 + ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0 } - - -define <vscale x 1 x float> @test_vlseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl) { ; CHECK-LABEL: 
test_vlseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg3e32.v v7, (a0) +; CHECK-NEXT: vlseg3e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, i64 %vl, i64 5) - %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1) - ret <vscale x 1 x float> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0 } - -define <vscale x 1 x float> @test_vlseg3_mask_nxv1f32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_mask_nxv1f32_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv1f32_triscv.vector.tuple_nxv4i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg3e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 5) - %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1) - ret <vscale x 1 x float> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0 } - - -define <vscale x 2 x float> @test_vlseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg3e32.v v7, (a0) +; CHECK-NEXT: vlseg3e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i64 %vl, i64 5) - %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1) - ret <vscale x 2 x float> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0 } - -define <vscale x 2 x float> @test_vlseg3_mask_nxv2f32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv2f32_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv2f32_triscv.vector.tuple_nxv8i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg3e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 5) - %1 = call <vscale x 2 x float> 
@llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1) - ret <vscale x 2 x float> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0 } - - -define <vscale x 4 x float> @test_vlseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vlseg3e32.v v6, (a0) +; CHECK-NEXT: vlseg3e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i64 %vl, i64 5) - %1 = call <vscale x 4 x float> @llvm.riscv.tuple.extract.nxv4f32.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1) - ret <vscale x 4 x float> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0 } - -define <vscale x 4 x float> @test_vlseg3_mask_nxv4f32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv4f32_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv4f32_triscv.vector.tuple_nxv16i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vlseg3e32.v v6, (a0), v0.t +; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5) - %1 = call <vscale x 4 x float> @llvm.riscv.tuple.extract.nxv4f32.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1) - ret <vscale x 4 x float> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0 } - - -define <vscale x 1 x float> @test_vlseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg4e32.v v7, (a0) +; CHECK-NEXT: vlseg4e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, i64 %vl, i64 5) - %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1) - ret <vscale x 1 x float> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0 } - -define <vscale x 1 x float> @test_vlseg4_mask_nxv1f32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_mask_nxv1f32_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv1f32_triscv.vector.tuple_nxv4i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli 
zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg4e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg4e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 5) - %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1) - ret <vscale x 1 x float> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0 } - - -define <vscale x 2 x float> @test_vlseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg4e32.v v7, (a0) +; CHECK-NEXT: vlseg4e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i64 %vl, i64 5) - %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1) - ret <vscale x 2 x float> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0 } - -define <vscale x 2 x float> @test_vlseg4_mask_nxv2f32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv2f32_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv2f32_triscv.vector.tuple_nxv8i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg4e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg4e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 5) - %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1) - ret <vscale x 2 x float> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0 } - - -define <vscale x 4 x float> @test_vlseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vlseg4e32.v v6, (a0) +; CHECK-NEXT: vlseg4e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i64 %vl, i64 5) - %1 = call <vscale x 4 x float> @llvm.riscv.tuple.extract.nxv4f32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1) - ret <vscale x 4 x float> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0 } - -define 
<vscale x 4 x float> @test_vlseg4_mask_nxv4f32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv4f32_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv4f32_triscv.vector.tuple_nxv16i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vlseg4e32.v v6, (a0), v0.t +; CHECK-NEXT: vlseg4e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 5) - %1 = call <vscale x 4 x float> @llvm.riscv.tuple.extract.nxv4f32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1) - ret <vscale x 4 x float> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0 } - - -define <vscale x 1 x float> @test_vlseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg5e32.v v7, (a0) +; CHECK-NEXT: vlseg5e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, i64 %vl, i64 5) - %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1) - ret <vscale x 1 x float> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0 } - -define <vscale x 1 x float> @test_vlseg5_mask_nxv1f32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_mask_nxv1f32_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg5_mask_nxv1f32_triscv.vector.tuple_nxv4i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg5e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg5e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 5) - %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1) - ret <vscale x 1 x float> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0 } - - -define <vscale x 2 x float> @test_vlseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg5e32.v v7, (a0) +; CHECK-NEXT: vlseg5e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x 
i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i64 %vl, i64 5) - %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1) - ret <vscale x 2 x float> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0 } - -define <vscale x 2 x float> @test_vlseg5_mask_nxv2f32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv2f32_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg5_mask_nxv2f32_triscv.vector.tuple_nxv8i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg5e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg5e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 5) - %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1) - ret <vscale x 2 x float> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0 } - - -define <vscale x 1 x float> @test_vlseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg6e32.v v7, (a0) +; CHECK-NEXT: vlseg6e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, i64 %vl, i64 5) - %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1) - ret <vscale x 1 x float> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0 } - -define <vscale x 1 x float> @test_vlseg6_mask_nxv1f32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_mask_nxv1f32_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg6_mask_nxv1f32_triscv.vector.tuple_nxv4i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg6e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg6e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 5) - %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1) - ret <vscale x 1 x float> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0 } - - -define <vscale x 2 x float> @test_vlseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) { +define 
target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg6e32.v v7, (a0) +; CHECK-NEXT: vlseg6e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i64 %vl, i64 5) - %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1) - ret <vscale x 2 x float> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0 } - -define <vscale x 2 x float> @test_vlseg6_mask_nxv2f32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv2f32_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg6_mask_nxv2f32_triscv.vector.tuple_nxv8i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg6e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg6e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 5) - %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1) - ret <vscale x 2 x float> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0 } - - -define <vscale x 1 x float> @test_vlseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg7e32.v v7, (a0) +; CHECK-NEXT: vlseg7e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, i64 %vl, i64 5) - %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1) - ret <vscale x 1 x float> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0 } - -define <vscale x 1 x float> @test_vlseg7_mask_nxv1f32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_mask_nxv1f32_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg7_mask_nxv1f32_triscv.vector.tuple_nxv4i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg7e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg7e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> 
%mask, i64 %vl, i64 1, i64 5) - %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1) - ret <vscale x 1 x float> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0 } - - -define <vscale x 2 x float> @test_vlseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg7e32.v v7, (a0) +; CHECK-NEXT: vlseg7e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i64 %vl, i64 5) - %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1) - ret <vscale x 2 x float> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0 } - -define <vscale x 2 x float> @test_vlseg7_mask_nxv2f32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv2f32_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg7_mask_nxv2f32_triscv.vector.tuple_nxv8i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg7e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg7e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 5) - %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1) - ret <vscale x 2 x float> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0 } - - -define <vscale x 1 x float> @test_vlseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg8e32.v v7, (a0) +; CHECK-NEXT: vlseg8e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, i64 %vl, i64 5) - %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1) - ret <vscale x 1 x float> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0 } - -define <vscale x 1 x float> @test_vlseg8_mask_nxv1f32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_mask_nxv1f32_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg8_mask_nxv1f32_triscv.vector.tuple_nxv4i8_8t: ; 
CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vlseg8e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg8e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 5) - %1 = call <vscale x 1 x float> @llvm.riscv.tuple.extract.nxv1f32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1) - ret <vscale x 1 x float> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0 } - - -define <vscale x 2 x float> @test_vlseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg8e32.v v7, (a0) +; CHECK-NEXT: vlseg8e32.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i64 %vl, i64 5) - %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1) - ret <vscale x 2 x float> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0 } - -define <vscale x 2 x float> @test_vlseg8_mask_nxv2f32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv2f32_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg8_mask_nxv2f32_triscv.vector.tuple_nxv8i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vlseg8e32.v v7, (a0), v0.t +; CHECK-NEXT: vlseg8e32.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 5) - %1 = call <vscale x 2 x float> @llvm.riscv.tuple.extract.nxv2f32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1) - ret <vscale x 2 x float> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0 } - - -define <vscale x 1 x double> @test_vlseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg2e64.v v7, (a0) +; CHECK-NEXT: vlseg2e64.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i64 %vl, i64 6) - %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1) - ret <vscale x 1 x double> %1 + ret target("riscv.vector.tuple", 
<vscale x 8 x i8>, 2) %0 } - -define <vscale x 1 x double> @test_vlseg2_mask_nxv1f64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv1f64_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv1f64_triscv.vector.tuple_nxv8i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg2e64.v v7, (a0), v0.t +; CHECK-NEXT: vlseg2e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 6) - %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1) - ret <vscale x 1 x double> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0 } - - -define <vscale x 2 x double> @test_vlseg2_nxv2f64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv2f64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg2_nxv2f64_triscv.vector.tuple_nxv16i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vlseg2e64.v v6, (a0) +; CHECK-NEXT: vlseg2e64.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i64 %vl, i64 6) - %1 = call <vscale x 2 x double> @llvm.riscv.tuple.extract.nxv2f64.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1) - ret <vscale x 2 x double> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0 } - -define <vscale x 2 x double> @test_vlseg2_mask_nxv2f64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv2f64_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv2f64_triscv.vector.tuple_nxv16i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vlseg2e64.v v6, (a0), v0.t +; CHECK-NEXT: vlseg2e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6) - %1 = call <vscale x 2 x double> @llvm.riscv.tuple.extract.nxv2f64.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1) - ret <vscale x 2 x double> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0 } - - -define <vscale x 4 x double> @test_vlseg2_nxv4f64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv4f64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg2_nxv4f64_triscv.vector.tuple_nxv32i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; CHECK-NEXT: vlseg2e64.v v4, (a0) +; CHECK-NEXT: vlseg2e64.v v8, (a0) ; CHECK-NEXT: ret 
entry: %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i64 %vl, i64 6) - %1 = call <vscale x 4 x double> @llvm.riscv.tuple.extract.nxv4f64.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1) - ret <vscale x 4 x double> %1 + ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0 } - -define <vscale x 4 x double> @test_vlseg2_mask_nxv4f64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv4f64_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv4f64_triscv.vector.tuple_nxv32i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; CHECK-NEXT: vlseg2e64.v v4, (a0), v0.t +; CHECK-NEXT: vlseg2e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 6) - %1 = call <vscale x 4 x double> @llvm.riscv.tuple.extract.nxv4f64.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1) - ret <vscale x 4 x double> %1 + ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0 } - - -define <vscale x 1 x double> @test_vlseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg3e64.v v7, (a0) +; CHECK-NEXT: vlseg3e64.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i64 %vl, i64 6) - %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1) - ret <vscale x 1 x double> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0 } - -define <vscale x 1 x double> @test_vlseg3_mask_nxv1f64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv1f64_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv1f64_triscv.vector.tuple_nxv8i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg3e64.v v7, (a0), v0.t +; CHECK-NEXT: vlseg3e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 6) - %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1) - ret <vscale x 1 x double> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0 } - - -define <vscale x 2 x double> 
@test_vlseg3_nxv2f64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv2f64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg3_nxv2f64_triscv.vector.tuple_nxv16i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vlseg3e64.v v6, (a0) +; CHECK-NEXT: vlseg3e64.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i64 %vl, i64 6) - %1 = call <vscale x 2 x double> @llvm.riscv.tuple.extract.nxv2f64.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1) - ret <vscale x 2 x double> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0 } - -define <vscale x 2 x double> @test_vlseg3_mask_nxv2f64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv2f64_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv2f64_triscv.vector.tuple_nxv16i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vlseg3e64.v v6, (a0), v0.t +; CHECK-NEXT: vlseg3e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6) - %1 = call <vscale x 2 x double> @llvm.riscv.tuple.extract.nxv2f64.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1) - ret <vscale x 2 x double> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0 } - - -define <vscale x 1 x double> @test_vlseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg4e64.v v7, (a0) +; CHECK-NEXT: vlseg4e64.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i64 %vl, i64 6) - %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1) - ret <vscale x 1 x double> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0 } - -define <vscale x 1 x double> @test_vlseg4_mask_nxv1f64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv1f64_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv1f64_triscv.vector.tuple_nxv8i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg4e64.v v7, (a0), v0.t +; CHECK-NEXT: vlseg4e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) 
@llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 6) - %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1) - ret <vscale x 1 x double> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0 } - - -define <vscale x 2 x double> @test_vlseg4_nxv2f64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv2f64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg4_nxv2f64_triscv.vector.tuple_nxv16i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vlseg4e64.v v6, (a0) +; CHECK-NEXT: vlseg4e64.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i64 %vl, i64 6) - %1 = call <vscale x 2 x double> @llvm.riscv.tuple.extract.nxv2f64.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1) - ret <vscale x 2 x double> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0 } - -define <vscale x 2 x double> @test_vlseg4_mask_nxv2f64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv2f64_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv2f64_triscv.vector.tuple_nxv16i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vlseg4e64.v v6, (a0), v0.t +; CHECK-NEXT: vlseg4e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 6) - %1 = call <vscale x 2 x double> @llvm.riscv.tuple.extract.nxv2f64.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1) - ret <vscale x 2 x double> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0 } - - -define <vscale x 1 x double> @test_vlseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg5e64.v v7, (a0) +; CHECK-NEXT: vlseg5e64.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i64 %vl, i64 6) - %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1) - ret <vscale x 1 x double> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0 } - -define <vscale x 1 x double> @test_vlseg5_mask_nxv1f64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) 
@test_vlseg5_mask_nxv1f64_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg5_mask_nxv1f64_triscv.vector.tuple_nxv8i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg5e64.v v7, (a0), v0.t +; CHECK-NEXT: vlseg5e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 6) - %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1) - ret <vscale x 1 x double> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0 } - - -define <vscale x 1 x double> @test_vlseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg6e64.v v7, (a0) +; CHECK-NEXT: vlseg6e64.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i64 %vl, i64 6) - %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1) - ret <vscale x 1 x double> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0 } - -define <vscale x 1 x double> @test_vlseg6_mask_nxv1f64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv1f64_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg6_mask_nxv1f64_triscv.vector.tuple_nxv8i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg6e64.v v7, (a0), v0.t +; CHECK-NEXT: vlseg6e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 6) - %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1) - ret <vscale x 1 x double> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0 } - - -define <vscale x 1 x double> @test_vlseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg7e64.v v7, (a0) +; CHECK-NEXT: vlseg7e64.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i64 %vl, i64 6) - %1 = call <vscale x 1 x double> 
@llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1) - ret <vscale x 1 x double> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0 } - -define <vscale x 1 x double> @test_vlseg7_mask_nxv1f64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv1f64_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg7_mask_nxv1f64_triscv.vector.tuple_nxv8i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg7e64.v v7, (a0), v0.t +; CHECK-NEXT: vlseg7e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 6) - %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1) - ret <vscale x 1 x double> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0 } - - -define <vscale x 1 x double> @test_vlseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg8e64.v v7, (a0) +; CHECK-NEXT: vlseg8e64.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i64 %vl, i64 6) - %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1) - ret <vscale x 1 x double> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0 } - -define <vscale x 1 x double> @test_vlseg8_mask_nxv1f64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv1f64_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg8_mask_nxv1f64_triscv.vector.tuple_nxv8i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vlseg8e64.v v7, (a0), v0.t +; CHECK-NEXT: vlseg8e64.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 6) - %1 = call <vscale x 1 x double> @llvm.riscv.tuple.extract.nxv1f64.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1) - ret <vscale x 1 x double> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0 } - - -define <vscale x 1 x bfloat> @test_vlseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_2t: 
; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg2e16.v v7, (a0) +; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1) - ret <vscale x 1 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0 } - -define <vscale x 1 x bfloat> @test_vlseg2_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @test_vlseg2_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_2t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_2t(target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0, i32 1) - ret <vscale x 1 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 2) %0 } - - -define <vscale x 2 x bfloat> @test_vlseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg2e16.v v7, (a0) +; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0, i32 1) - ret <vscale x 2 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0 } - -define <vscale x 2 x bfloat> @test_vlseg2_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @test_vlseg2_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_2t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 2) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", <vscale x 
4 x i8>, 2) %0, i32 1) - ret <vscale x 2 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 2) %0 } - - -define <vscale x 4 x bfloat> @test_vlseg2_nxv4bf16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_nxv4bf16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg2_nxv4bf16_triscv.vector.tuple_nxv8i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg2e16.v v7, (a0) +; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1) - ret <vscale x 4 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0 } - -define <vscale x 4 x bfloat> @test_vlseg2_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @test_vlseg2_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_2t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg2e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_2t(target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0, i32 1) - ret <vscale x 4 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 2) %0 } - - -define <vscale x 8 x bfloat> @test_vlseg2_nxv8bf16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_nxv8bf16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg2_nxv8bf16_triscv.vector.tuple_nxv16i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vlseg2e16.v v6, (a0) +; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 8 x bfloat> @llvm.riscv.tuple.extract.nxv8bf16.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1) - ret <vscale x 8 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0 } - -define <vscale x 8 x bfloat> @test_vlseg2_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @test_vlseg2_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_2t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vlseg2e16.v v6, (a0), v0.t +; CHECK-NEXT: 
vlseg2e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv16i8_2t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 8 x bfloat> @llvm.riscv.tuple.extract.nxv8bf16.triscv.vector.tuple_nxv16i8_2t(target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0, i32 1) - ret <vscale x 8 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 2) %0 } - - -define <vscale x 16 x bfloat> @test_vlseg2_nxv16bf16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_nxv16bf16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg2_nxv16bf16_triscv.vector.tuple_nxv32i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vlseg2e16.v v4, (a0) +; CHECK-NEXT: vlseg2e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 16 x bfloat> @llvm.riscv.tuple.extract.nxv16bf16.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1) - ret <vscale x 16 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0 } - -define <vscale x 16 x bfloat> @test_vlseg2_mask_nxv16bf16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 16 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @test_vlseg2_mask_nxv16bf16_triscv.vector.tuple_nxv32i8_2t(ptr %base, i64 %vl, <vscale x 16 x i1> %mask) { ; CHECK-LABEL: test_vlseg2_mask_nxv16bf16_triscv.vector.tuple_nxv32i8_2t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vlseg2e16.v v4, (a0), v0.t +; CHECK-NEXT: vlseg2e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 32 x i8>, 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv32i8_2t.nxv16i1(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) undef, ptr %base, <vscale x 16 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 16 x bfloat> @llvm.riscv.tuple.extract.nxv16bf16.triscv.vector.tuple_nxv32i8_2t(target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0, i32 1) - ret <vscale x 16 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 32 x i8>, 2) %0 } - - -define <vscale x 1 x bfloat> @test_vlseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg3e16.v v7, (a0) +; CHECK-NEXT: vlseg3e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1) - ret <vscale x 1 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0 } - -define <vscale x 1 x bfloat> 
@test_vlseg3_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @test_vlseg3_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_3t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_3t(target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0, i32 1) - ret <vscale x 1 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 3) %0 } - - -define <vscale x 2 x bfloat> @test_vlseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg3e16.v v7, (a0) +; CHECK-NEXT: vlseg3e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1) - ret <vscale x 2 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0 } - -define <vscale x 2 x bfloat> @test_vlseg3_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @test_vlseg3_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_3t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0, i32 1) - ret <vscale x 2 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 3) %0 } - - -define <vscale x 4 x bfloat> @test_vlseg3_nxv4bf16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_nxv4bf16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg3_nxv4bf16_triscv.vector.tuple_nxv8i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg3e16.v v7, (a0) +; CHECK-NEXT: vlseg3e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 
3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1) - ret <vscale x 4 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0 } - -define <vscale x 4 x bfloat> @test_vlseg3_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @test_vlseg3_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_3t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg3e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0, i32 1) - ret <vscale x 4 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 3) %0 } - - -define <vscale x 8 x bfloat> @test_vlseg3_nxv8bf16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_nxv8bf16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg3_nxv8bf16_triscv.vector.tuple_nxv16i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vlseg3e16.v v6, (a0) +; CHECK-NEXT: vlseg3e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 8 x bfloat> @llvm.riscv.tuple.extract.nxv8bf16.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1) - ret <vscale x 8 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0 } - -define <vscale x 8 x bfloat> @test_vlseg3_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @test_vlseg3_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_3t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) { ; CHECK-LABEL: test_vlseg3_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_3t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vlseg3e16.v v6, (a0), v0.t +; CHECK-NEXT: vlseg3e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 8 x bfloat> @llvm.riscv.tuple.extract.nxv8bf16.triscv.vector.tuple_nxv16i8_3t(target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0, i32 1) - ret <vscale x 8 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 3) %0 } - - -define <vscale x 1 x bfloat> @test_vlseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 
%vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg4e16.v v7, (a0) +; CHECK-NEXT: vlseg4e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1) - ret <vscale x 1 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0 } - -define <vscale x 1 x bfloat> @test_vlseg4_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @test_vlseg4_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_4t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_4t(target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0, i32 1) - ret <vscale x 1 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 4) %0 } - - -define <vscale x 2 x bfloat> @test_vlseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg4e16.v v7, (a0) +; CHECK-NEXT: vlseg4e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1) - ret <vscale x 2 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0 } - -define <vscale x 2 x bfloat> @test_vlseg4_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @test_vlseg4_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_4t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 
4) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0, i32 1) - ret <vscale x 2 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 4) %0 } - - -define <vscale x 4 x bfloat> @test_vlseg4_nxv4bf16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_nxv4bf16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg4_nxv4bf16_triscv.vector.tuple_nxv8i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg4e16.v v7, (a0) +; CHECK-NEXT: vlseg4e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1) - ret <vscale x 4 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0 } - -define <vscale x 4 x bfloat> @test_vlseg4_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @test_vlseg4_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_4t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg4_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg4e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_4t(target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0, i32 1) - ret <vscale x 4 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 4) %0 } - - -define <vscale x 8 x bfloat> @test_vlseg4_nxv8bf16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_nxv8bf16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg4_nxv8bf16_triscv.vector.tuple_nxv16i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vlseg4e16.v v6, (a0) +; CHECK-NEXT: vlseg4e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 8 x bfloat> @llvm.riscv.tuple.extract.nxv8bf16.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1) - ret <vscale x 8 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0 } - -define <vscale x 8 x bfloat> @test_vlseg4_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @test_vlseg4_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_4t(ptr %base, i64 %vl, <vscale x 8 x i1> %mask) 
{ ; CHECK-LABEL: test_vlseg4_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_4t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vlseg4e16.v v6, (a0), v0.t +; CHECK-NEXT: vlseg4e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 16 x i8>, 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) undef, ptr %base, <vscale x 8 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 8 x bfloat> @llvm.riscv.tuple.extract.nxv8bf16.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0, i32 1) - ret <vscale x 8 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 16 x i8>, 4) %0 } - - -define <vscale x 1 x bfloat> @test_vlseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg5e16.v v7, (a0) +; CHECK-NEXT: vlseg5e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1) - ret <vscale x 1 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0 } - -define <vscale x 1 x bfloat> @test_vlseg5_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @test_vlseg5_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_5t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg5_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_5t(target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0, i32 1) - ret <vscale x 1 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 5) %0 } - - -define <vscale x 2 x bfloat> @test_vlseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg5e16.v v7, (a0) +; CHECK-NEXT: vlseg5e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 2 x bfloat> 
@llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1) - ret <vscale x 2 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0 } - -define <vscale x 2 x bfloat> @test_vlseg5_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @test_vlseg5_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_5t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg5_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0, i32 1) - ret <vscale x 2 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 5) %0 } - - -define <vscale x 4 x bfloat> @test_vlseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg5e16.v v7, (a0) +; CHECK-NEXT: vlseg5e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1) - ret <vscale x 4 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0 } - -define <vscale x 4 x bfloat> @test_vlseg5_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @test_vlseg5_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_5t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg5_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_5t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg5e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg5e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0, i32 1) - ret <vscale x 4 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 5) %0 } - - -define <vscale x 1 x bfloat> @test_vlseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl) { ; CHECK-LABEL: 
test_vlseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg6e16.v v7, (a0) +; CHECK-NEXT: vlseg6e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1) - ret <vscale x 1 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0 } - -define <vscale x 1 x bfloat> @test_vlseg6_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @test_vlseg6_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_6t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg6_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_6t(target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0, i32 1) - ret <vscale x 1 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 6) %0 } - - -define <vscale x 2 x bfloat> @test_vlseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg6e16.v v7, (a0) +; CHECK-NEXT: vlseg6e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1) - ret <vscale x 2 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0 } - -define <vscale x 2 x bfloat> @test_vlseg6_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @test_vlseg6_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_6t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg6_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 2 x bfloat> 
@llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0, i32 1) - ret <vscale x 2 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 6) %0 } - - -define <vscale x 4 x bfloat> @test_vlseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg6e16.v v7, (a0) +; CHECK-NEXT: vlseg6e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1) - ret <vscale x 4 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0 } - -define <vscale x 4 x bfloat> @test_vlseg6_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @test_vlseg6_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_6t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg6_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_6t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg6e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg6e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_6t(target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0, i32 1) - ret <vscale x 4 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 6) %0 } - - -define <vscale x 1 x bfloat> @test_vlseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg7e16.v v7, (a0) +; CHECK-NEXT: vlseg7e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1) - ret <vscale x 1 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0 } - -define <vscale x 1 x bfloat> @test_vlseg7_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @test_vlseg7_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_7t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg7_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_7t(target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0, i32 1) - ret <vscale x 1 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 7) %0 } - - -define <vscale x 2 x bfloat> @test_vlseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg7e16.v v7, (a0) +; CHECK-NEXT: vlseg7e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1) - ret <vscale x 2 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0 } - -define <vscale x 2 x bfloat> @test_vlseg7_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @test_vlseg7_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_7t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg7_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0, i32 1) - ret <vscale x 2 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 7) %0 } - - -define <vscale x 4 x bfloat> @test_vlseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg7e16.v v7, (a0) +; CHECK-NEXT: vlseg7e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1) - ret <vscale x 4 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 8 x 
i8>, 7) %0 } - -define <vscale x 4 x bfloat> @test_vlseg7_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @test_vlseg7_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_7t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg7_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_7t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg7e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg7e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0, i32 1) - ret <vscale x 4 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 7) %0 } - - -define <vscale x 1 x bfloat> @test_vlseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg8e16.v v7, (a0) +; CHECK-NEXT: vlseg8e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1) - ret <vscale x 1 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0 } - -define <vscale x 1 x bfloat> @test_vlseg8_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @test_vlseg8_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_8t(ptr %base, i64 %vl, <vscale x 1 x i1> %mask) { ; CHECK-LABEL: test_vlseg8_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 2 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) undef, ptr %base, <vscale x 1 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 1 x bfloat> @llvm.riscv.tuple.extract.nxv1bf16.triscv.vector.tuple_nxv2i8_8t(target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0, i32 1) - ret <vscale x 1 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 2 x i8>, 8) %0 } - - -define <vscale x 2 x bfloat> @test_vlseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg8e16.v v7, (a0) +; CHECK-NEXT: vlseg8e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call 
target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1) - ret <vscale x 2 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0 } - -define <vscale x 2 x bfloat> @test_vlseg8_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @test_vlseg8_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_8t(ptr %base, i64 %vl, <vscale x 2 x i1> %mask) { ; CHECK-LABEL: test_vlseg8_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 4 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) undef, ptr %base, <vscale x 2 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 2 x bfloat> @llvm.riscv.tuple.extract.nxv2bf16.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0, i32 1) - ret <vscale x 2 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 4 x i8>, 8) %0 } - - -define <vscale x 4 x bfloat> @test_vlseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl) { ; CHECK-LABEL: test_vlseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg8e16.v v7, (a0) +; CHECK-NEXT: vlseg8e16.v v8, (a0) ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, i64 %vl, i64 4) - %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1) - ret <vscale x 4 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0 } - -define <vscale x 4 x bfloat> @test_vlseg8_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { +define target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @test_vlseg8_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_8t(ptr %base, i64 %vl, <vscale x 4 x i1> %mask) { ; CHECK-LABEL: test_vlseg8_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_8t: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vlseg8e16.v v7, (a0), v0.t +; CHECK-NEXT: vlseg8e16.v v8, (a0), v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", <vscale x 8 x i8>, 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) undef, ptr %base, <vscale x 4 x i1> %mask, i64 %vl, i64 1, i64 4) - %1 = call <vscale x 4 x bfloat> @llvm.riscv.tuple.extract.nxv4bf16.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0, i32 1) - ret <vscale x 4 x bfloat> %1 + ret target("riscv.vector.tuple", <vscale x 8 x i8>, 8) %0 } - diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll 
b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll new file mode 100644 index 0000000..6e2d860 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll @@ -0,0 +1,145 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128,+relaxed-simd | FileCheck %s --check-prefix=RELAXED +; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128 | FileCheck %s --check-prefix=STRICT + +target triple = "wasm32" + +define double @fsub_fmul_contract_f64(double %a, double %b, double %c) { +; RELAXED-LABEL: fsub_fmul_contract_f64: +; RELAXED: .functype fsub_fmul_contract_f64 (f64, f64, f64) -> (f64) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f64.mul $push0=, $1, $0 +; RELAXED-NEXT: f64.sub $push1=, $2, $pop0 +; RELAXED-NEXT: return $pop1 +; +; STRICT-LABEL: fsub_fmul_contract_f64: +; STRICT: .functype fsub_fmul_contract_f64 (f64, f64, f64) -> (f64) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f64.mul $push0=, $1, $0 +; STRICT-NEXT: f64.sub $push1=, $2, $pop0 +; STRICT-NEXT: return $pop1 + %mul = fmul contract double %b, %a + %sub = fsub contract double %c, %mul + ret double %sub +} + +define <4 x float> @fsub_fmul_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) { +; RELAXED-LABEL: fsub_fmul_contract_4xf32: +; RELAXED: .functype fsub_fmul_contract_4xf32 (v128, v128, v128) -> (v128) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $2, $1, $0 +; RELAXED-NEXT: return $pop0 +; +; STRICT-LABEL: fsub_fmul_contract_4xf32: +; STRICT: .functype fsub_fmul_contract_4xf32 (v128, v128, v128) -> (v128) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f32x4.mul $push0=, $1, $0 +; STRICT-NEXT: f32x4.sub $push1=, $2, $pop0 +; STRICT-NEXT: return $pop1 + %mul = fmul contract <4 x float> %b, %a + %sub = fsub contract <4 x float> %c, %mul + ret <4 x float> %sub +} + + +define <8 x half> @fsub_fmul_contract_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) { +; RELAXED-LABEL: fsub_fmul_contract_8xf16: +; RELAXED: .functype fsub_fmul_contract_8xf16 (v128, v128, v128) -> (v128) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f16x8.relaxed_nmadd $push0=, $2, $1, $0 +; RELAXED-NEXT: return $pop0 +; +; STRICT-LABEL: fsub_fmul_contract_8xf16: +; STRICT: .functype fsub_fmul_contract_8xf16 (v128, v128, v128) -> (v128) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f16x8.mul $push0=, $1, $0 +; STRICT-NEXT: f16x8.sub $push1=, $2, $pop0 +; STRICT-NEXT: return $pop1 + %mul = fmul contract <8 x half> %b, %a + %sub = fsub contract <8 x half> %c, %mul + ret <8 x half> %sub +} + + +define <4 x float> @fsub_fmul_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) { +; RELAXED-LABEL: fsub_fmul_4xf32: +; RELAXED: .functype fsub_fmul_4xf32 (v128, v128, v128) -> (v128) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f32x4.mul $push0=, $1, $0 +; RELAXED-NEXT: f32x4.sub $push1=, $2, $pop0 +; RELAXED-NEXT: return $pop1 +; +; STRICT-LABEL: fsub_fmul_4xf32: +; STRICT: .functype fsub_fmul_4xf32 (v128, v128, v128) -> (v128) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f32x4.mul $push0=, $1, $0 +; STRICT-NEXT: f32x4.sub $push1=, $2, $pop0 +; STRICT-NEXT: return $pop1 + %mul = fmul <4 x float> %b, %a + %sub = fsub contract <4 x float> %c, %mul + ret <4 x float> %sub +} + +define <8 x float> @fsub_fmul_contract_8xf32(<8 x float> %a, <8 x float> %b, <8 x float> %c) { +; RELAXED-LABEL: 
fsub_fmul_contract_8xf32: +; RELAXED: .functype fsub_fmul_contract_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> () +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $6, $4, $2 +; RELAXED-NEXT: v128.store 16($0), $pop0 +; RELAXED-NEXT: f32x4.relaxed_nmadd $push1=, $5, $3, $1 +; RELAXED-NEXT: v128.store 0($0), $pop1 +; RELAXED-NEXT: return +; +; STRICT-LABEL: fsub_fmul_contract_8xf32: +; STRICT: .functype fsub_fmul_contract_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> () +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f32x4.mul $push0=, $4, $2 +; STRICT-NEXT: f32x4.sub $push1=, $6, $pop0 +; STRICT-NEXT: v128.store 16($0), $pop1 +; STRICT-NEXT: f32x4.mul $push2=, $3, $1 +; STRICT-NEXT: f32x4.sub $push3=, $5, $pop2 +; STRICT-NEXT: v128.store 0($0), $pop3 +; STRICT-NEXT: return + %mul = fmul contract <8 x float> %b, %a + %sub = fsub contract <8 x float> %c, %mul + ret <8 x float> %sub +} + + +define <2 x double> @fsub_fmul_contract_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) { +; RELAXED-LABEL: fsub_fmul_contract_2xf64: +; RELAXED: .functype fsub_fmul_contract_2xf64 (v128, v128, v128) -> (v128) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f64x2.relaxed_nmadd $push0=, $2, $1, $0 +; RELAXED-NEXT: return $pop0 +; +; STRICT-LABEL: fsub_fmul_contract_2xf64: +; STRICT: .functype fsub_fmul_contract_2xf64 (v128, v128, v128) -> (v128) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f64x2.mul $push0=, $1, $0 +; STRICT-NEXT: f64x2.sub $push1=, $2, $pop0 +; STRICT-NEXT: return $pop1 + %mul = fmul contract <2 x double> %b, %a + %sub = fsub contract <2 x double> %c, %mul + ret <2 x double> %sub +} + +define float @fsub_fmul_contract_f32(float %a, float %b, float %c) { +; RELAXED-LABEL: fsub_fmul_contract_f32: +; RELAXED: .functype fsub_fmul_contract_f32 (f32, f32, f32) -> (f32) +; RELAXED-NEXT: # %bb.0: +; RELAXED-NEXT: f32.mul $push0=, $1, $0 +; RELAXED-NEXT: f32.sub $push1=, $2, $pop0 +; RELAXED-NEXT: return $pop1 +; +; STRICT-LABEL: fsub_fmul_contract_f32: +; STRICT: .functype fsub_fmul_contract_f32 (f32, f32, f32) -> (f32) +; STRICT-NEXT: # %bb.0: +; STRICT-NEXT: f32.mul $push0=, $1, $0 +; STRICT-NEXT: f32.sub $push1=, $2, $pop0 +; STRICT-NEXT: return $pop1 + %mul = fmul contract float %b, %a + %sub = fsub contract float %c, %mul + ret float %sub +} + diff --git a/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll b/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll index 8a9052c..fa42481 100644 --- a/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll +++ b/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll @@ -6,16 +6,12 @@ ; CHECK: .visible .func use_dbg_declare() ; CHECK: .local .align 8 .b8 __local_depot0[8]; ; CHECK: mov.b64 %SPL, __local_depot0; -; CHECK: add.u64 %rd1, %SP, 0; ; CHECK: .loc 1 5 3 // t.c:5:3 ; CHECK: { // callseq 0, 0 ; CHECK: .param .b64 param0; +; CHECK: add.u64 %rd1, %SP, 0; ; CHECK: st.param.b64 [param0], %rd1; -; CHECK: call.uni -; CHECK: escape_foo, -; CHECK: ( -; CHECK: param0 -; CHECK: ); +; CHECK: call.uni escape_foo, (param0); ; CHECK: } // callseq 0 ; CHECK: .loc 1 6 1 // t.c:6:1 ; CHECK: ret; diff --git a/llvm/test/DebugInfo/X86/branch-folder-dbg-after-end.mir b/llvm/test/DebugInfo/X86/branch-folder-dbg-after-end.mir new file mode 100644 index 0000000..743851c --- /dev/null +++ b/llvm/test/DebugInfo/X86/branch-folder-dbg-after-end.mir @@ -0,0 +1,108 @@ +# RUN: llc %s --start-before=branch-folder --stop-after=branch-folder -o - \ +# RUN: | FileCheck %s --implicit-check-not=DBG_PHI + +## Common instructions are hoisted. 
Check that trailing debug instructions in +## the range are also hoisted, and don't cause a crash. +## +## Note the MIR doesn't match the IR as it's modified from: +## /home/och/dev/llvm-project/llvm/test/DebugInfo/X86/branch-folder-dbg.mir + +# CHECK: bb.0 +# CHECK: CALL64pcrel32 @f, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax +## --- Start splice from bb.2.if.else (and debug instructions from bb.1.if.then) --- +# CHECK-NEXT: $edi = MOV32r0 implicit-def dead $eflags, debug-location !DILocation(line: 0, scope: ![[#]]) +# CHECK-NEXT: DBG_VALUE $noreg, $noreg, ![[#]], !DIExpression(), debug-location +# CHECK-NEXT: DBG_VALUE $noreg, $noreg, ![[#]], !DIExpression(), debug-location +## --- End splice ------------------------------------------------------------------ +# CHECK-NEXT: TEST64rr killed renamable $rax, renamable $rax, implicit-def $eflags +# CHECK-NEXT: JCC_1 %bb.2, 8, implicit $eflags +# CHECK: bb.1 + +--- | + target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" + target triple = "x86_64-unknown-linux-gnu" + + declare dso_local noundef i64 @f() local_unnamed_addr + + define dso_local noundef i32 @g() local_unnamed_addr !dbg !7 { + %call = tail call noundef i64 @f() + %cmp1 = icmp sgt i64 0, %call + %conv2 = trunc i64 0 to i32 + br i1 %cmp1, label %if.then, label %if.else + + if.then: ; preds = %0 + tail call void @_Z3fooii(i32 noundef %conv2, i32 noundef 0), !dbg !14 + br label %if.end, !dbg !15 + + if.else: ; preds = %0 + tail call void @_Z3barii(i32 noundef %conv2, i32 noundef 1), !dbg !16 + br label %if.end, !dbg !17 + + if.end: ; preds = %if.else, %if.then + ret i32 2 + } + + declare void @_Z3fooii(i32 noundef, i32 noundef) local_unnamed_addr + + declare void @_Z3barii(i32 noundef, i32 noundef) local_unnamed_addr + + !llvm.module.flags = !{!0, !1} + !llvm.ident = !{!2} + !llvm.dbg.cu = !{!3} + !llvm.debugify = !{!5, !6} + + !0 = !{i32 7, !"Dwarf Version", i32 5} + !1 = !{i32 2, !"Debug Info Version", i32 3} + !2 = !{!"clang version 21.0.0"} + !3 = distinct !DICompileUnit(language: DW_LANG_C, file: !4, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) + !4 = !DIFile(filename: "test.nodbg.ll", directory: "/") + !5 = !{i32 15} + !6 = !{i32 7} + !7 = distinct !DISubprogram(name: "g", linkageName: "g", scope: null, file: !4, line: 1, type: !8, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !3, retainedNodes: !10) + !8 = !DISubroutineType(types: !9) + !9 = !{} + !10 = !{!11} + !11 = !DILocalVariable(name: "1", scope: !7, file: !4, line: 3, type: !12) + !12 = !DIBasicType(name: "ty64", size: 64, encoding: DW_ATE_unsigned) + !13 = !DILocation(line: 3, column: 1, scope: !7) + !14 = !DILocation(line: 9, column: 1, scope: !7) + !15 = !DILocation(line: 10, column: 1, scope: !7) + !16 = !DILocation(line: 11, column: 1, scope: !7) + !17 = !DILocation(line: 12, column: 1, scope: !7) +... 
+--- +name: g +tracksRegLiveness: true +isSSA: false +body: | + bb.0 (%ir-block.0): + successors: %bb.1(0x40000000), %bb.2(0x40000000) + + frame-setup PUSH64r undef $rax, implicit-def $rsp, implicit $rsp + frame-setup CFI_INSTRUCTION def_cfa_offset 16 + CALL64pcrel32 @f, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax + TEST64rr killed renamable $rax, renamable $rax, implicit-def $eflags + JCC_1 %bb.2, 9, implicit killed $eflags + JMP_1 %bb.1 + + bb.1.if.then: + successors: %bb.3(0x80000000) + + $edi = MOV32r0 implicit-def dead $eflags, debug-location !14 + DBG_VALUE $edi, $noreg, !11, !DIExpression(), debug-location !13 + + bb.3.if.end: + $eax = MOV32ri 2 + $rcx = frame-destroy POP64r implicit-def $rsp, implicit $rsp + frame-destroy CFI_INSTRUCTION def_cfa_offset 8 + RET 0, $eax + + bb.2.if.else: + successors: %bb.3(0x80000000) + + $edi = MOV32r0 implicit-def dead $eflags, debug-location !16 + DBG_VALUE $edi, $noreg, !11, !DIExpression(), debug-location !13 + CALL64pcrel32 target-flags(x86-plt) @_Z3barii, csr_64, implicit $rsp, implicit $ssp, implicit killed $edi, implicit killed $edi, implicit-def $rsp, implicit-def $ssp, debug-location !16 + JMP_1 %bb.3, debug-location !15 + +... diff --git a/llvm/test/DebugInfo/X86/branch-folder-dbg.mir b/llvm/test/DebugInfo/X86/branch-folder-dbg.mir index 5c38fd2..7832598 100644 --- a/llvm/test/DebugInfo/X86/branch-folder-dbg.mir +++ b/llvm/test/DebugInfo/X86/branch-folder-dbg.mir @@ -1,16 +1,24 @@ -# RUN: llc %s --start-before=branch-folder --stop-after=branch-folder -o - | FileCheck %s +# RUN: llc %s --start-before=branch-folder --stop-after=branch-folder -o - \ +# RUN: | FileCheck %s --implicit-check-not=DBG_PHI ## Check that common instructions hoisted from `if.then` and `if.else` into -## common pred `entry` get merged debug locations. - -## FIXME: The debug instructions handling here is wrong. +## common pred `entry` get merged debug locations. The debug instructions from +## both branches should get hoisted and killed. +## +## The MIR debug instructions have been modified by hand in order to check they +## can be killed. +## +## Check DBG_PHIs are deleted rather than hoisted (implicit-check-not). # CHECK: bb.0 # CHECK: CALL64pcrel32 @f, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax -## --- Start splice from bb.2.if.else --- -# CHECK-NEXT: DBG_VALUE 2, $noreg, ![[#]], !DIExpression(), debug-location ![[#]] -# CHECK-NEXT: $edi = MOV32r0 implicit-def dead $eflags, debug-location !DILocation(line: 0, scope: ![[#]]) -## --- End splice -------------- +## --- Start splice from bb.2.if.else (and debug instructions from bb.1.if.then) --- +# CHECK-NEXT: DBG_VALUE $noreg, $noreg, ![[#]], !DIExpression(), debug-location ![[#]] +# CHECK-NEXT: DBG_VALUE $noreg, $noreg, ![[#]], !DIExpression(), debug-location ![[#]] +# CHECK-NEXT: $edi = MOV32r0 implicit-def dead $eflags, debug-instr-number 2, debug-location !DILocation(line: 0, scope: ![[#]]) +# CHECK-NEXT: DBG_VALUE $noreg, $noreg, ![[#]], !DIExpression(DW_OP_LLVM_arg, 0), debug-location ![[#]] +# CHECK-NEXT: DBG_VALUE $noreg, $noreg, ![[#]], !DIExpression(DW_OP_LLVM_arg, 0), debug-location ![[#]] +## --- End splice ------------------------------------------------------------------ # CHECK-NEXT: TEST64rr killed renamable $rax, renamable $rax, implicit-def $eflags # CHECK-NEXT: JCC_1 %bb.2, 9, implicit killed $eflags # CHECK: bb.1 @@ -73,6 +81,8 @@ ... 
--- name: g +tracksRegLiveness: true +isSSA: false body: | bb.0 (%ir-block.0): successors: %bb.1(0x40000000), %bb.2(0x40000000) @@ -87,21 +97,23 @@ body: | bb.1.if.then: successors: %bb.3(0x80000000) - DBG_VALUE 0, $noreg, !11, !DIExpression(), debug-location !13 - $edi = MOV32r0 implicit-def dead $eflags, debug-location !14 + DBG_PHI $esp, 3 + DBG_VALUE $esi, $noreg, !11, !DIExpression(), debug-location !13 + $edi = MOV32r0 implicit-def dead $eflags, debug-instr-number 1, debug-location !14 + DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(1, 0), debug-location !13 $esi = MOV32r0 implicit-def dead $eflags, debug-location !14 CALL64pcrel32 target-flags(x86-plt) @_Z3fooii, csr_64, implicit $rsp, implicit $ssp, implicit killed $edi, implicit killed $esi, implicit-def $rsp, implicit-def $ssp, debug-location !14 - DBG_VALUE 1, $noreg, !11, !DIExpression(), debug-location !13 JMP_1 %bb.3, debug-location !15 bb.2.if.else: successors: %bb.3(0x80000000) - DBG_VALUE 2, $noreg, !11, !DIExpression(), debug-location !13 - $edi = MOV32r0 implicit-def dead $eflags, debug-location !16 + DBG_PHI $esp, 4 + DBG_VALUE $esp, $noreg, !11, !DIExpression(), debug-location !13 + $edi = MOV32r0 implicit-def dead $eflags, debug-instr-number 2, debug-location !16 + DBG_INSTR_REF !11, !DIExpression(DW_OP_LLVM_arg, 0), dbg-instr-ref(2, 0), debug-location !13 $esi = MOV32ri 1, debug-location !16 CALL64pcrel32 target-flags(x86-plt) @_Z3barii, csr_64, implicit $rsp, implicit $ssp, implicit killed $edi, implicit killed $esi, implicit-def $rsp, implicit-def $ssp, debug-location !16 - DBG_VALUE 3, $noreg, !11, !DIExpression(), debug-location !13 bb.3.if.end: $eax = MOV32ri 2 diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s index b9eb2d2..c5288a7 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s @@ -249,6 +249,250 @@ flat_load_monitor_b32 v1, v[2:3] offset:64 // GFX1250: flat_load_monitor_b32 v1, v[2:3] offset:64 ; encoding: [0x7c,0x00,0x1c,0xec,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU +flat_load_monitor_b32 v1, v[2:3] offset:-64 th:TH_LOAD_BYPASS scope:SCOPE_SYS +// GFX1250: flat_load_monitor_b32 v1, v[2:3] offset:-64 th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x00,0x1c,0xec,0x01,0x00,0x3c,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +flat_load_monitor_b64 v[0:1], v[2:3] +// GFX1250: flat_load_monitor_b64 v[0:1], v[2:3] ; encoding: [0x7c,0x40,0x1c,0xec,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +flat_load_monitor_b64 v[0:1], v[2:3] offset:64 +// GFX1250: flat_load_monitor_b64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0x40,0x1c,0xec,0x00,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +flat_load_monitor_b64 v[0:1], v[2:3] offset:-64 th:TH_LOAD_BYPASS scope:SCOPE_SYS +// GFX1250: flat_load_monitor_b64 v[0:1], v[2:3] offset:-64 th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x40,0x1c,0xec,0x00,0x00,0x3c,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +flat_load_monitor_b128 v[0:3], v[4:5] +// GFX1250: flat_load_monitor_b128 v[0:3], v[4:5] ; encoding: [0x7c,0x80,0x1c,0xec,0x00,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +// GFX12-ERR: 
:[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +flat_load_monitor_b128 v[0:3], v[4:5] offset:64 +// GFX1250: flat_load_monitor_b128 v[0:3], v[4:5] offset:64 ; encoding: [0x7c,0x80,0x1c,0xec,0x00,0x00,0x00,0x00,0x04,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +flat_load_monitor_b128 v[0:3], v[4:5] offset:-64 th:TH_LOAD_BYPASS scope:SCOPE_SYS +// GFX1250: flat_load_monitor_b128 v[0:3], v[4:5] offset:-64 th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x80,0x1c,0xec,0x00,0x00,0x3c,0x00,0x04,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +flat_load_monitor_b32 v1, v2, s[4:5] offset:64 scale_offset +// GFX1250: flat_load_monitor_b32 v1, v2, s[4:5] offset:64 scale_offset ; encoding: [0x04,0x00,0x1c,0xec,0x01,0x00,0x01,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +flat_load_monitor_b64 v[2:3], v2, s[4:5] offset:64 scale_offset +// GFX1250: flat_load_monitor_b64 v[2:3], v2, s[4:5] offset:64 scale_offset ; encoding: [0x04,0x40,0x1c,0xec,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_store_async_from_lds_b8 v[2:3], v1, off th:TH_STORE_BYPASS scope:SCOPE_SYS +// GFX1250: global_store_async_from_lds_b8 v[2:3], v1, off th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0xc0,0x18,0xee,0x00,0x00,0xbc,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_store_async_from_lds_b8 v[2:3], v1, off offset:64 +// GFX1250: global_store_async_from_lds_b8 v[2:3], v1, off offset:64 ; encoding: [0x7c,0xc0,0x18,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_store_async_from_lds_b8 v[2:3], v1, off offset:-64 +// GFX1250: global_store_async_from_lds_b8 v[2:3], v1, off offset:-64 ; encoding: [0x7c,0xc0,0x18,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_store_async_from_lds_b8 v2, v1, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV +// GFX1250: global_store_async_from_lds_b8 v2, v1, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0xc0,0x18,0xee,0x00,0x00,0xe8,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_store_async_from_lds_b8 v2, v1, s[2:3] offset:64 +// GFX1250: global_store_async_from_lds_b8 v2, v1, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x18,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_store_async_from_lds_b8 v2, v1, s[2:3] offset:-64 +// GFX1250: global_store_async_from_lds_b8 v2, v1, s[2:3] offset:-64 ; encoding: [0x02,0xc0,0x18,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_store_async_from_lds_b32 v[2:3], v1, off th:TH_STORE_BYPASS scope:SCOPE_SYS +// GFX1250: global_store_async_from_lds_b32 v[2:3], v1, off th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x00,0x19,0xee,0x00,0x00,0xbc,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_store_async_from_lds_b32 v[2:3], v1, off offset:64 +// GFX1250: 
global_store_async_from_lds_b32 v[2:3], v1, off offset:64 ; encoding: [0x7c,0x00,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_store_async_from_lds_b32 v[2:3], v1, off offset:-64 +// GFX1250: global_store_async_from_lds_b32 v[2:3], v1, off offset:-64 ; encoding: [0x7c,0x00,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_store_async_from_lds_b32 v2, v1, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV +// GFX1250: global_store_async_from_lds_b32 v2, v1, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x00,0x19,0xee,0x00,0x00,0xe8,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_store_async_from_lds_b32 v2, v1, s[2:3] offset:64 +// GFX1250: global_store_async_from_lds_b32 v2, v1, s[2:3] offset:64 ; encoding: [0x02,0x00,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_store_async_from_lds_b32 v2, v1, s[2:3] offset:-64 +// GFX1250: global_store_async_from_lds_b32 v2, v1, s[2:3] offset:-64 ; encoding: [0x02,0x00,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_store_async_from_lds_b64 v[2:3], v1, off th:TH_STORE_BYPASS scope:SCOPE_SYS +// GFX1250: global_store_async_from_lds_b64 v[2:3], v1, off th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x40,0x19,0xee,0x00,0x00,0xbc,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_store_async_from_lds_b64 v[2:3], v1, off offset:64 +// GFX1250: global_store_async_from_lds_b64 v[2:3], v1, off offset:64 ; encoding: [0x7c,0x40,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_store_async_from_lds_b64 v[2:3], v1, off offset:-64 +// GFX1250: global_store_async_from_lds_b64 v[2:3], v1, off offset:-64 ; encoding: [0x7c,0x40,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_store_async_from_lds_b64 v2, v1, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV +// GFX1250: global_store_async_from_lds_b64 v2, v1, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x40,0x19,0xee,0x00,0x00,0xe8,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_store_async_from_lds_b64 v2, v1, s[2:3] offset:64 +// GFX1250: global_store_async_from_lds_b64 v2, v1, s[2:3] offset:64 ; encoding: [0x02,0x40,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_store_async_from_lds_b64 v2, v1, s[2:3] offset:-64 +// GFX1250: global_store_async_from_lds_b64 v2, v1, s[2:3] offset:-64 ; encoding: [0x02,0x40,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_store_async_from_lds_b128 v[2:3], v1, off th:TH_STORE_BYPASS scope:SCOPE_SYS +// GFX1250: global_store_async_from_lds_b128 v[2:3], v1, off th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x80,0x19,0xee,0x00,0x00,0xbc,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: 
:[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_store_async_from_lds_b128 v[2:3], v1, off offset:64 +// GFX1250: global_store_async_from_lds_b128 v[2:3], v1, off offset:64 ; encoding: [0x7c,0x80,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_store_async_from_lds_b128 v[2:3], v1, off offset:-64 +// GFX1250: global_store_async_from_lds_b128 v[2:3], v1, off offset:-64 ; encoding: [0x7c,0x80,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_store_async_from_lds_b128 v2, v1, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV +// GFX1250: global_store_async_from_lds_b128 v2, v1, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x80,0x19,0xee,0x00,0x00,0xe8,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_store_async_from_lds_b128 v2, v1, s[2:3] offset:64 +// GFX1250: global_store_async_from_lds_b128 v2, v1, s[2:3] offset:64 ; encoding: [0x02,0x80,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_store_async_from_lds_b128 v2, v1, s[2:3] offset:-64 +// GFX1250: global_store_async_from_lds_b128 v2, v1, s[2:3] offset:-64 ; encoding: [0x02,0x80,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_store_async_from_lds_b32 v2, v1, s[4:5] scale_offset th:TH_STORE_BYPASS scope:SCOPE_SYS +// GFX1250: global_store_async_from_lds_b32 v2, v1, s[4:5] scale_offset th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0x00,0x19,0xee,0x00,0x00,0xbd,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_store_async_from_lds_b64 v2, v1, s[4:5] scale_offset th:TH_STORE_BYPASS scope:SCOPE_SYS +// GFX1250: global_store_async_from_lds_b64 v2, v1, s[4:5] scale_offset th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0x40,0x19,0xee,0x00,0x00,0xbd,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_load_async_to_lds_b8 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS +// GFX1250: global_load_async_to_lds_b8 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0xc0,0x17,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_load_async_to_lds_b8 v1, v[2:3], off offset:64 +// GFX1250: global_load_async_to_lds_b8 v1, v[2:3], off offset:64 ; encoding: [0x7c,0xc0,0x17,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_load_async_to_lds_b8 v1, v[2:3], off offset:-64 +// GFX1250: global_load_async_to_lds_b8 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0xc0,0x17,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_load_async_to_lds_b8 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV +// GFX1250: global_load_async_to_lds_b8 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0xc0,0x17,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + 
+global_load_async_to_lds_b8 v1, v2, s[2:3] offset:64 +// GFX1250: global_load_async_to_lds_b8 v1, v2, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x17,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_load_async_to_lds_b8 v1, v2, s[2:3] offset:-64 +// GFX1250: global_load_async_to_lds_b8 v1, v2, s[2:3] offset:-64 ; encoding: [0x02,0xc0,0x17,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_load_async_to_lds_b32 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS +// GFX1250: global_load_async_to_lds_b32 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x00,0x18,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_load_async_to_lds_b32 v1, v[2:3], off offset:64 +// GFX1250: global_load_async_to_lds_b32 v1, v[2:3], off offset:64 ; encoding: [0x7c,0x00,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_load_async_to_lds_b32 v1, v[2:3], off offset:-64 +// GFX1250: global_load_async_to_lds_b32 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0x00,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_load_async_to_lds_b32 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV +// GFX1250: global_load_async_to_lds_b32 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x00,0x18,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_load_async_to_lds_b32 v1, v2, s[2:3] offset:64 +// GFX1250: global_load_async_to_lds_b32 v1, v2, s[2:3] offset:64 ; encoding: [0x02,0x00,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_load_async_to_lds_b32 v1, v2, s[2:3] offset:-64 +// GFX1250: global_load_async_to_lds_b32 v1, v2, s[2:3] offset:-64 ; encoding: [0x02,0x00,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_load_async_to_lds_b64 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS +// GFX1250: global_load_async_to_lds_b64 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x40,0x18,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_load_async_to_lds_b64 v1, v[2:3], off offset:64 +// GFX1250: global_load_async_to_lds_b64 v1, v[2:3], off offset:64 ; encoding: [0x7c,0x40,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_load_async_to_lds_b64 v1, v[2:3], off offset:-64 +// GFX1250: global_load_async_to_lds_b64 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0x40,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_load_async_to_lds_b64 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV +// GFX1250: global_load_async_to_lds_b64 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x40,0x18,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: 
error: instruction not supported on this GPU + +global_load_async_to_lds_b64 v1, v2, s[2:3] offset:64 +// GFX1250: global_load_async_to_lds_b64 v1, v2, s[2:3] offset:64 ; encoding: [0x02,0x40,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_load_async_to_lds_b64 v1, v2, s[2:3] offset:-64 +// GFX1250: global_load_async_to_lds_b64 v1, v2, s[2:3] offset:-64 ; encoding: [0x02,0x40,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_load_async_to_lds_b128 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS +// GFX1250: global_load_async_to_lds_b128 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x80,0x18,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_load_async_to_lds_b128 v1, v[2:3], off offset:64 +// GFX1250: global_load_async_to_lds_b128 v1, v[2:3], off offset:64 ; encoding: [0x7c,0x80,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_load_async_to_lds_b128 v1, v[2:3], off offset:-64 +// GFX1250: global_load_async_to_lds_b128 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0x80,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_load_async_to_lds_b128 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV +// GFX1250: global_load_async_to_lds_b128 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x80,0x18,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_load_async_to_lds_b128 v1, v2, s[2:3] offset:64 +// GFX1250: global_load_async_to_lds_b128 v1, v2, s[2:3] offset:64 ; encoding: [0x02,0x80,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_load_async_to_lds_b128 v1, v2, s[2:3] offset:-64 +// GFX1250: global_load_async_to_lds_b128 v1, v2, s[2:3] offset:-64 ; encoding: [0x02,0x80,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_load_async_to_lds_b32 v2, v1, s[4:5] scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS +// GFX1250: global_load_async_to_lds_b32 v2, v1, s[4:5] scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0x00,0x18,0xee,0x02,0x00,0x3d,0x00,0x01,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +global_load_async_to_lds_b64 v2, v1, s[4:5] scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS +// GFX1250: global_load_async_to_lds_b64 v2, v1, s[4:5] scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0x40,0x18,0xee,0x02,0x00,0x3d,0x00,0x01,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + tensor_save s[0:1] // GFX1250: tensor_save s[0:1] ; encoding: [0x00,0x80,0x1b,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat_err.s index 26d7ed3..c9fe702 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat_err.s +++ 
b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat_err.s @@ -57,3 +57,51 @@ scratch_load_b32 v5, off, off offset:32 scale_offset // GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: scale_offset is not supported for this instruction // GFX1250-ERR-NEXT:{{^}}scratch_load_b32 v5, off, off offset:32 scale_offset // GFX1250-ERR-NEXT:{{^}} ^ + +global_store_async_from_lds_b8 v[2:3], v1, off th:TH_LOAD_BYPASS scope:SCOPE_SYS +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for store instructions + +global_store_async_from_lds_b8 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for store instructions + +global_store_async_from_lds_b32 v[2:3], v1, off th:TH_LOAD_BYPASS scope:SCOPE_SYS +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for store instructions + +global_store_async_from_lds_b32 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for store instructions + +global_store_async_from_lds_b64 v[2:3], v1, off th:TH_LOAD_BYPASS scope:SCOPE_SYS +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for store instructions + +global_store_async_from_lds_b64 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for store instructions + +global_store_async_from_lds_b128 v[2:3], v1, off th:TH_LOAD_BYPASS scope:SCOPE_SYS +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for store instructions + +global_store_async_from_lds_b128 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for store instructions + +global_load_async_to_lds_b8 v1, v[2:3], off th:TH_STORE_BYPASS scope:SCOPE_SYS +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for load instructions + +global_load_async_to_lds_b8 v1, v2, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for load instructions + +global_load_async_to_lds_b32 v1, v[2:3], off th:TH_STORE_BYPASS scope:SCOPE_SYS +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for load instructions + +global_load_async_to_lds_b32 v1, v2, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for load instructions + +global_load_async_to_lds_b64 v1, v[2:3], off th:TH_STORE_BYPASS scope:SCOPE_SYS +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for load instructions + +global_load_async_to_lds_b64 v1, v2, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for load instructions + +global_load_async_to_lds_b128 v1, v[2:3], off th:TH_STORE_BYPASS scope:SCOPE_SYS +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for load instructions + +global_load_async_to_lds_b128 v1, v2, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV +// GFX1250-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid th value for load instructions diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt index de7895f..291192b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt @@ -3177,6 +3177,162 @@ # GFX1250: global_load_monitor_b64 v[2:3], v2, s[4:5] offset:64 scale_offset ; encoding: [0x04,0x40,0x1c,0xee,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00] 
0x04,0x40,0x1c,0xee,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00 +# GFX1250: global_load_async_to_lds_b128 v1, v[2:3], off offset:64 ; encoding: [0x7c,0x80,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +0x7c,0x80,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: global_load_async_to_lds_b128 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0x80,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +0x7c,0x80,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff + +# GFX1250: global_load_async_to_lds_b128 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x80,0x18,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00] +0x7c,0x80,0x18,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: global_load_async_to_lds_b128 v1, v2, s[2:3] offset:64 ; encoding: [0x02,0x80,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +0x02,0x80,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: global_load_async_to_lds_b128 v1, v2, s[2:3] offset:-64 ; encoding: [0x02,0x80,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +0x02,0x80,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff + +# GFX1250: global_load_async_to_lds_b128 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x80,0x18,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00] +0x02,0x80,0x18,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: global_load_async_to_lds_b32 v1, v[2:3], off offset:64 ; encoding: [0x7c,0x00,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +0x7c,0x00,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: global_load_async_to_lds_b32 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0x00,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +0x7c,0x00,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff + +# GFX1250: global_load_async_to_lds_b32 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x00,0x18,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00] +0x7c,0x00,0x18,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: global_load_async_to_lds_b32 v1, v2, s[2:3] offset:64 ; encoding: [0x02,0x00,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +0x02,0x00,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: global_load_async_to_lds_b32 v1, v2, s[2:3] offset:-64 ; encoding: [0x02,0x00,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +0x02,0x00,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff + +# GFX1250: global_load_async_to_lds_b32 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x00,0x18,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00] +0x02,0x00,0x18,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: global_load_async_to_lds_b64 v1, v[2:3], off offset:64 ; encoding: [0x7c,0x40,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +0x7c,0x40,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: global_load_async_to_lds_b64 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0x40,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +0x7c,0x40,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff + +# GFX1250: global_load_async_to_lds_b64 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x40,0x18,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00] +0x7c,0x40,0x18,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: global_load_async_to_lds_b64 v1, v2, s[2:3] offset:64 ; encoding: [0x02,0x40,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +0x02,0x40,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: global_load_async_to_lds_b64 v1, v2, s[2:3] offset:-64 ; encoding: 
[0x02,0x40,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +0x02,0x40,0x18,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff + +# GFX1250: global_load_async_to_lds_b64 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x40,0x18,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00] +0x02,0x40,0x18,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: global_load_async_to_lds_b8 v1, v[2:3], off offset:64 ; encoding: [0x7c,0xc0,0x17,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +0x7c,0xc0,0x17,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: global_load_async_to_lds_b8 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0xc0,0x17,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +0x7c,0xc0,0x17,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff + +# GFX1250: global_load_async_to_lds_b8 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0xc0,0x17,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00] +0x7c,0xc0,0x17,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: global_load_async_to_lds_b8 v1, v2, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x17,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00] +0x02,0xc0,0x17,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: global_load_async_to_lds_b8 v1, v2, s[2:3] offset:-64 ; encoding: [0x02,0xc0,0x17,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff] +0x02,0xc0,0x17,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff + +# GFX1250: global_load_async_to_lds_b8 v1, v2, s[2:3] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0xc0,0x17,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00] +0x02,0xc0,0x17,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: global_load_async_to_lds_b32 v2, v1, s[4:5] scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0x00,0x18,0xee,0x02,0x00,0x3d,0x00,0x01,0x00,0x00,0x00] +0x04,0x00,0x18,0xee,0x02,0x00,0x3d,0x00,0x01,0x00,0x00,0x00 + +# GFX1250: global_load_async_to_lds_b64 v2, v1, s[4:5] scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0x40,0x18,0xee,0x02,0x00,0x3d,0x00,0x01,0x00,0x00,0x00] +0x04,0x40,0x18,0xee,0x02,0x00,0x3d,0x00,0x01,0x00,0x00,0x00 + +# GFX1250: global_store_async_from_lds_b128 v[2:3], v1, off offset:64 ; encoding: [0x7c,0x80,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00] +0x7c,0x80,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: global_store_async_from_lds_b128 v[2:3], v1, off offset:-64 ; encoding: [0x7c,0x80,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff] +0x7c,0x80,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff + +# GFX1250: global_store_async_from_lds_b128 v[2:3], v1, off th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x80,0x19,0xee,0x00,0x00,0xbc,0x00,0x02,0x00,0x00,0x00] +0x7c,0x80,0x19,0xee,0x00,0x00,0xbc,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: global_store_async_from_lds_b128 v2, v1, s[2:3] offset:64 ; encoding: [0x02,0x80,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00] +0x02,0x80,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: global_store_async_from_lds_b128 v2, v1, s[2:3] offset:-64 ; encoding: [0x02,0x80,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff] +0x02,0x80,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff + +# GFX1250: global_store_async_from_lds_b128 v2, v1, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x80,0x19,0xee,0x00,0x00,0xe8,0x00,0x02,0x00,0x00,0x00] +0x02,0x80,0x19,0xee,0x00,0x00,0xe8,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: global_store_async_from_lds_b32 v[2:3], v1, off offset:64 ; encoding: [0x7c,0x00,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00] 
+0x7c,0x00,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: global_store_async_from_lds_b32 v[2:3], v1, off offset:-64 ; encoding: [0x7c,0x00,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff] +0x7c,0x00,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff + +# GFX1250: global_store_async_from_lds_b32 v[2:3], v1, off th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x00,0x19,0xee,0x00,0x00,0xbc,0x00,0x02,0x00,0x00,0x00] +0x7c,0x00,0x19,0xee,0x00,0x00,0xbc,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: global_store_async_from_lds_b32 v2, v1, s[2:3] offset:64 ; encoding: [0x02,0x00,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00] +0x02,0x00,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: global_store_async_from_lds_b32 v2, v1, s[2:3] offset:-64 ; encoding: [0x02,0x00,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff] +0x02,0x00,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff + +# GFX1250: global_store_async_from_lds_b32 v2, v1, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x00,0x19,0xee,0x00,0x00,0xe8,0x00,0x02,0x00,0x00,0x00] +0x02,0x00,0x19,0xee,0x00,0x00,0xe8,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: global_store_async_from_lds_b64 v[2:3], v1, off offset:64 ; encoding: [0x7c,0x40,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00] +0x7c,0x40,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: global_store_async_from_lds_b64 v[2:3], v1, off offset:-64 ; encoding: [0x7c,0x40,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff] +0x7c,0x40,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff + +# GFX1250: global_store_async_from_lds_b64 v[2:3], v1, off th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x40,0x19,0xee,0x00,0x00,0xbc,0x00,0x02,0x00,0x00,0x00] +0x7c,0x40,0x19,0xee,0x00,0x00,0xbc,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: global_store_async_from_lds_b64 v2, v1, s[2:3] offset:64 ; encoding: [0x02,0x40,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00] +0x02,0x40,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: global_store_async_from_lds_b64 v2, v1, s[2:3] offset:-64 ; encoding: [0x02,0x40,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff] +0x02,0x40,0x19,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff + +# GFX1250: global_store_async_from_lds_b64 v2, v1, s[2:3] th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0x40,0x19,0xee,0x00,0x00,0xe8,0x00,0x02,0x00,0x00,0x00] +0x02,0x40,0x19,0xee,0x00,0x00,0xe8,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: global_store_async_from_lds_b8 v[2:3], v1, off offset:64 ; encoding: [0x7c,0xc0,0x18,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00] +0x7c,0xc0,0x18,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: global_store_async_from_lds_b8 v[2:3], v1, off offset:-64 ; encoding: [0x7c,0xc0,0x18,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff] +0x7c,0xc0,0x18,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff + +# GFX1250: global_store_async_from_lds_b8 v[2:3], v1, off th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0xc0,0x18,0xee,0x00,0x00,0xbc,0x00,0x02,0x00,0x00,0x00] +0x7c,0xc0,0x18,0xee,0x00,0x00,0xbc,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: global_store_async_from_lds_b8 v2, v1, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x18,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00] +0x02,0xc0,0x18,0xee,0x00,0x00,0x80,0x00,0x02,0x40,0x00,0x00 + +# GFX1250: global_store_async_from_lds_b8 v2, v1, s[2:3] offset:-64 ; encoding: [0x02,0xc0,0x18,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff] +0x02,0xc0,0x18,0xee,0x00,0x00,0x80,0x00,0x02,0xc0,0xff,0xff + +# GFX1250: global_store_async_from_lds_b8 v2, v1, s[2:3] 
th:TH_STORE_NT_HT scope:SCOPE_DEV ; encoding: [0x02,0xc0,0x18,0xee,0x00,0x00,0xe8,0x00,0x02,0x00,0x00,0x00] +0x02,0xc0,0x18,0xee,0x00,0x00,0xe8,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: global_store_async_from_lds_b32 v2, v1, s[4:5] scale_offset th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0x00,0x19,0xee,0x00,0x00,0xbd,0x00,0x02,0x00,0x00,0x00] +0x04,0x00,0x19,0xee,0x00,0x00,0xbd,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: global_store_async_from_lds_b64 v2, v1, s[4:5] scale_offset th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0x40,0x19,0xee,0x00,0x00,0xbd,0x00,0x02,0x00,0x00,0x00] +0x04,0x40,0x19,0xee,0x00,0x00,0xbd,0x00,0x02,0x00,0x00,0x00 + # GFX1250: tensor_save s[0:1] ; encoding: [0x00,0x80,0x1b,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] 0x00,0x80,0x1b,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 diff --git a/llvm/test/ThinLTO/X86/memprof_func_assign_fix.ll b/llvm/test/ThinLTO/X86/memprof_func_assign_fix.ll new file mode 100644 index 0000000..8303d6d --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof_func_assign_fix.ll @@ -0,0 +1,145 @@ +;; Make sure we assign the original callsite to a function clone (which will be +;; the original function clone), even when we cannot update its caller (due to +;; missing metadata e.g. from mismatched profiles). Otherwise we will try to use +;; the original function for a different clone, leading to confusion later when +;; rewriting the calls. + +;; -stats requires asserts +; REQUIRES: asserts + +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -supports-hot-cold-new \ +; RUN: -r=%t.o,A,plx \ +; RUN: -r=%t.o,B,plx \ +; RUN: -r=%t.o,C,plx \ +; RUN: -r=%t.o,D,plx \ +; RUN: -r=%t.o,E,plx \ +; RUN: -r=%t.o,F,plx \ +; RUN: -r=%t.o,G,plx \ +; RUN: -r=%t.o,A1,plx \ +; RUN: -r=%t.o,B1,plx \ +; RUN: -r=%t.o,_Znwm, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -debug-only=memprof-context-disambiguation \ +; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \ +; RUN: -o %t.out 2>&1 | FileCheck %s \ +; RUN: --implicit-check-not="Mismatch in call clone assignment" \ +; RUN: --implicit-check-not="Number of callsites assigned to call multiple non-matching clones" + +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR + +; ModuleID = '<stdin>' +source_filename = "reduced.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; IR-LABEL: define dso_local void @A() +define void @A() #0 { + ; IR: call void @C() + call void @C() + ret void +} + +; IR-LABEL: define dso_local void @B() +define void @B() #0 { + ; IR: call void @C.memprof.1() + call void @C(), !callsite !1 + ret void +} + +; IR-LABEL: define dso_local void @C() +define void @C() #0 { + ; IR: call void @F() + call void @F(), !callsite !16 + ; IR: call void @D() + call void @D(), !callsite !2 + ret void +} + +; IR-LABEL: define dso_local void @D() +define void @D() #0 { + ; IR: call void @E() + call void @E(), !callsite !3 + ; IR: call void @G() + call void @G(), !callsite !17 + ret void +} + +; IR-LABEL: define dso_local void @E() +define void @E() #0 { + ; IR: call ptr @_Znwm(i64 0) #[[NOTCOLD:[0-9]+]] + %1 = call ptr @_Znwm(i64 0), !memprof !4, !callsite !9 + ret void +} + +; IR-LABEL: define dso_local void @F() +define void @F() #0 { + ; IR: call void @G() + call void @G(), !callsite !17 + ret void +} + +; IR-LABEL: define dso_local void @G() +define void @G() #0 { + ; IR: call ptr @_Znwm(i64 0) 
#[[NOTCOLD]] + %2 = call ptr @_Znwm(i64 0), !memprof !10, !callsite !15 + ret void +} + +; IR-LABEL: define dso_local void @A1() +define void @A1() #0 { + ; IR: call void @C() + call void @C(), !callsite !18 + ret void +} + +; IR-LABEL: define dso_local void @B1() +define void @B1() #0 { + ; IR: call void @C.memprof.1() + call void @C(), !callsite !19 + ret void +} + +; IR-LABEL: define dso_local void @C.memprof.1() + ; IR: call void @F.memprof.1() + ; IR: call void @D.memprof.1() + +; IR-LABEL: define dso_local void @D.memprof.1() + ; IR: call void @E.memprof.1() + ; IR: call void @G() + +; IR-LABEL: define dso_local void @E.memprof.1() + ; IR: call ptr @_Znwm(i64 0) #[[COLD:[0-9]+]] + +; IR-LABEL: define dso_local void @F.memprof.1() + ; IR: call void @G.memprof.1() + +; IR-LABEL: define dso_local void @G.memprof.1() + ; IR: call ptr @_Znwm(i64 0) #[[COLD]] + +declare ptr @_Znwm(i64) + +attributes #0 = { noinline optnone } +; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } +; IR: attributes #[[COLD]] = { "memprof"="cold" } + +!0 = !{i64 123} +!1 = !{i64 234} +!2 = !{i64 345} +!3 = !{i64 456} +!4 = !{!5, !7} +!5 = !{!6, !"notcold"} +!6 = !{i64 567, i64 456, i64 345, i64 123} +!7 = !{!8, !"cold"} +!8 = !{i64 567, i64 456, i64 345, i64 234} +!9 = !{i64 567} +!10 = !{!11, !13} +!11 = !{!12, !"notcold"} +!12 = !{i64 678, i64 891, i64 789, i64 912} +!13 = !{!14, !"cold"} +!14 = !{i64 678, i64 891, i64 789, i64 812} +!15 = !{i64 678} +!16 = !{i64 789} +!17 = !{i64 891} +!18 = !{i64 912} +!19 = !{i64 812} diff --git a/llvm/test/Transforms/FunctionAttrs/noalias.ll b/llvm/test/Transforms/FunctionAttrs/noalias.ll new file mode 100644 index 0000000..8beb6fe --- /dev/null +++ b/llvm/test/Transforms/FunctionAttrs/noalias.ll @@ -0,0 +1,245 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=function-attrs < %s | FileCheck %s + +declare noalias ptr @malloc(i64 %size) +declare ptr @not_malloc(i64 %size) +declare void @capture(ptr) + +@g = external global i8 + +define ptr @return_malloc(i64 %size) { +; CHECK-LABEL: define noalias ptr @return_malloc( +; CHECK-SAME: i64 [[SIZE:%.*]]) { +; CHECK-NEXT: [[A:%.*]] = call ptr @malloc(i64 [[SIZE]]) +; CHECK-NEXT: ret ptr [[A]] +; + %a = call ptr @malloc(i64 %size) + ret ptr %a +} + +define ptr @return_not_malloc(i64 %size) { +; CHECK-LABEL: define ptr @return_not_malloc( +; CHECK-SAME: i64 [[SIZE:%.*]]) { +; CHECK-NEXT: [[A:%.*]] = call ptr @not_malloc(i64 [[SIZE]]) +; CHECK-NEXT: ret ptr [[A]] +; + %a = call ptr @not_malloc(i64 %size) + ret ptr %a +} + +define ptr @return_null() { +; CHECK-LABEL: define noalias noundef ptr @return_null( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: ret ptr null +; + ret ptr null +} + +define ptr @return_poison() { +; CHECK-LABEL: define noalias ptr @return_poison( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: ret ptr poison +; + ret ptr poison +} + +define ptr @return_alloca() { +; CHECK-LABEL: define noalias noundef nonnull ptr @return_alloca( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[A:%.*]] = alloca i8, align 1 +; CHECK-NEXT: ret ptr [[A]] +; + %a = alloca i8 + ret ptr %a +} + +; noalias arg does not imply noalias return +define ptr @return_noalias_arg(ptr noalias %arg) { +; CHECK-LABEL: define ptr @return_noalias_arg( +; CHECK-SAME: ptr noalias readnone returned captures(ret: address, provenance) [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret ptr [[ARG]] +; + ret ptr %arg +} + +define ptr @return_global() { +; CHECK-LABEL: define noundef 
nonnull ptr @return_global( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: ret ptr @g +; + ret ptr @g +} + +define ptr @no_return() { +; CHECK-LABEL: define noalias noundef nonnull ptr @no_return( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: unreachable +; + unreachable +} + +define ptr @return_multiple(i1 %c, i64 %size) { +; CHECK-LABEL: define noalias ptr @return_multiple( +; CHECK-SAME: i1 [[C:%.*]], i64 [[SIZE:%.*]]) { +; CHECK-NEXT: br i1 [[C]], label %[[IF:.*]], label %[[ELSE:.*]] +; CHECK: [[IF]]: +; CHECK-NEXT: [[A:%.*]] = call ptr @malloc(i64 [[SIZE]]) +; CHECK-NEXT: ret ptr [[A]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[B:%.*]] = call ptr @malloc(i64 [[SIZE]]) +; CHECK-NEXT: ret ptr [[B]] +; +br i1 %c, label %if, label %else + +if: + %a = call ptr @malloc(i64 %size) + ret ptr %a + +else: + %b = call ptr @malloc(i64 %size) + ret ptr %b +} + +define ptr @return_select(i1 %c, i64 %size) { +; CHECK-LABEL: define noalias ptr @return_select( +; CHECK-SAME: i1 [[C:%.*]], i64 [[SIZE:%.*]]) { +; CHECK-NEXT: [[A:%.*]] = call ptr @malloc(i64 [[SIZE]]) +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C]], ptr [[A]], ptr null +; CHECK-NEXT: ret ptr [[SEL]] +; + %a = call ptr @malloc(i64 %size) + %sel = select i1 %c, ptr %a, ptr null + ret ptr %sel +} + +define ptr @return_phi(i1 %c, i64 %size) { +; CHECK-LABEL: define noalias ptr @return_phi( +; CHECK-SAME: i1 [[C:%.*]], i64 [[SIZE:%.*]]) { +; CHECK-NEXT: br i1 [[C]], label %[[IF:.*]], label %[[ELSE:.*]] +; CHECK: [[IF]]: +; CHECK-NEXT: [[A:%.*]] = call ptr @malloc(i64 [[SIZE]]) +; CHECK-NEXT: br label %[[JOIN:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: br label %[[JOIN]] +; CHECK: [[JOIN]]: +; CHECK-NEXT: [[PHI:%.*]] = phi ptr [ [[A]], %[[IF]] ], [ null, %[[ELSE]] ] +; CHECK-NEXT: ret ptr [[PHI]] +; + br i1 %c, label %if, label %else + +if: + %a = call ptr @malloc(i64 %size) + br label %join + +else: + br label %join + +join: + %phi = phi ptr [ %a, %if ], [ null, %else ] + ret ptr %phi +} + +define ptr @return_phi_wrong(i1 %c, i64 %size) { +; CHECK-LABEL: define ptr @return_phi_wrong( +; CHECK-SAME: i1 [[C:%.*]], i64 [[SIZE:%.*]]) { +; CHECK-NEXT: br i1 [[C]], label %[[IF:.*]], label %[[ELSE:.*]] +; CHECK: [[IF]]: +; CHECK-NEXT: [[A:%.*]] = call ptr @malloc(i64 [[SIZE]]) +; CHECK-NEXT: br label %[[JOIN:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[B:%.*]] = call ptr @not_malloc(i64 [[SIZE]]) +; CHECK-NEXT: br label %[[JOIN]] +; CHECK: [[JOIN]]: +; CHECK-NEXT: [[PHI:%.*]] = phi ptr [ [[A]], %[[IF]] ], [ [[B]], %[[ELSE]] ] +; CHECK-NEXT: ret ptr [[PHI]] +; + br i1 %c, label %if, label %else + +if: + %a = call ptr @malloc(i64 %size) + br label %join + +else: + %b = call ptr @not_malloc(i64 %size) + br label %join + +join: + %phi = phi ptr [ %a, %if ], [ %b, %else ] + ret ptr %phi +} + +define ptr @return_malloc_with_store(i64 %size) { +; CHECK-LABEL: define noalias noundef ptr @return_malloc_with_store( +; CHECK-SAME: i64 [[SIZE:%.*]]) { +; CHECK-NEXT: [[A:%.*]] = call ptr @malloc(i64 [[SIZE]]) +; CHECK-NEXT: store i8 0, ptr [[A]], align 1 +; CHECK-NEXT: ret ptr [[A]] +; + %a = call ptr @malloc(i64 %size) + store i8 0, ptr %a + ret ptr %a +} + +define ptr @return_malloc_captured(i64 %size) { +; CHECK-LABEL: define ptr @return_malloc_captured( +; CHECK-SAME: i64 [[SIZE:%.*]]) { +; CHECK-NEXT: [[A:%.*]] = call ptr @malloc(i64 [[SIZE]]) +; CHECK-NEXT: call void @capture(ptr [[A]]) +; CHECK-NEXT: ret ptr [[A]] +; + %a = call ptr @malloc(i64 %size) + call void @capture(ptr %a) + ret ptr %a +} + +define ptr @scc1(i1 %c) { +; CHECK-LABEL: define noalias ptr @scc1( 
+; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: br i1 [[C]], label %[[IF:.*]], label %[[ELSE:.*]] +; CHECK: [[IF]]: +; CHECK-NEXT: [[A:%.*]] = call ptr @malloc(i64 4) +; CHECK-NEXT: ret ptr [[A]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[B:%.*]] = call ptr @scc2(i1 [[C]]) +; CHECK-NEXT: ret ptr [[B]] +; + br i1 %c, label %if, label %else + +if: + %a = call ptr @malloc(i64 4) + ret ptr %a + +else: + %b = call ptr @scc2(i1 %c) + ret ptr %b +} + +define ptr @scc2(i1 %c) { +; CHECK-LABEL: define noalias ptr @scc2( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: [[A:%.*]] = call ptr @scc1(i1 [[C]]) +; CHECK-NEXT: ret ptr [[A]] +; + %a = call ptr @scc1(i1 %c) + ret ptr %a +} + +define ptr @return_unknown_call(ptr %fn) { +; CHECK-LABEL: define ptr @return_unknown_call( +; CHECK-SAME: ptr readonly captures(none) [[FN:%.*]]) { +; CHECK-NEXT: [[A:%.*]] = call ptr [[FN]]() +; CHECK-NEXT: ret ptr [[A]] +; + %a = call ptr %fn() + ret ptr %a +} + +define ptr @return_unknown_noalias_call(ptr %fn) { +; CHECK-LABEL: define ptr @return_unknown_noalias_call( +; CHECK-SAME: ptr readonly captures(none) [[FN:%.*]]) { +; CHECK-NEXT: [[A:%.*]] = call noalias ptr [[FN]]() +; CHECK-NEXT: ret ptr [[A]] +; + %a = call noalias ptr %fn() + ret ptr %a +} diff --git a/llvm/test/Transforms/FunctionAttrs/nofree.ll b/llvm/test/Transforms/FunctionAttrs/nofree.ll index 1671189..89f030d 100644 --- a/llvm/test/Transforms/FunctionAttrs/nofree.ll +++ b/llvm/test/Transforms/FunctionAttrs/nofree.ll @@ -156,6 +156,24 @@ entry: ret void } +define void @unknown_call(ptr %fn) { +; CHECK-LABEL: @unknown_call( +; CHECK-NEXT: call void [[FN:%.*]]() +; CHECK-NEXT: ret void +; + call void %fn() + ret void +} + +define void @unknown_nofree_call(ptr %fn) { +; CHECK-LABEL: @unknown_nofree_call( +; CHECK-NEXT: call void [[FN:%.*]]() #[[ATTR5]] +; CHECK-NEXT: ret void +; + call void %fn() nofree + ret void +} + declare void @_ZdaPv(ptr) local_unnamed_addr #4 attributes #0 = { uwtable } diff --git a/llvm/test/Transforms/FunctionAttrs/nonnull.ll b/llvm/test/Transforms/FunctionAttrs/nonnull.ll index 483b560..9b17ded 100644 --- a/llvm/test/Transforms/FunctionAttrs/nonnull.ll +++ b/llvm/test/Transforms/FunctionAttrs/nonnull.ll @@ -1396,5 +1396,35 @@ define ptr @pr91177_non_inbounds_gep(ptr nonnull %arg) { ret ptr %res } +define ptr @unknown_func(ptr %fn) { +; FNATTRS-LABEL: define ptr @unknown_func( +; FNATTRS-SAME: ptr readonly captures(none) [[FN:%.*]]) { +; FNATTRS-NEXT: [[RES:%.*]] = call ptr [[FN]]() +; FNATTRS-NEXT: ret ptr [[RES]] +; +; ATTRIBUTOR-LABEL: define ptr @unknown_func( +; ATTRIBUTOR-SAME: ptr nofree nonnull captures(none) [[FN:%.*]]) { +; ATTRIBUTOR-NEXT: [[RES:%.*]] = call ptr [[FN]]() +; ATTRIBUTOR-NEXT: ret ptr [[RES]] +; + %res = call ptr %fn() + ret ptr %res +} + +define ptr @unknown_nonnull_func(ptr %fn) { +; FNATTRS-LABEL: define ptr @unknown_nonnull_func( +; FNATTRS-SAME: ptr readonly captures(none) [[FN:%.*]]) { +; FNATTRS-NEXT: [[RES:%.*]] = call nonnull ptr [[FN]]() +; FNATTRS-NEXT: ret ptr [[RES]] +; +; ATTRIBUTOR-LABEL: define nonnull ptr @unknown_nonnull_func( +; ATTRIBUTOR-SAME: ptr nofree nonnull captures(none) [[FN:%.*]]) { +; ATTRIBUTOR-NEXT: [[RES:%.*]] = call nonnull ptr [[FN]]() +; ATTRIBUTOR-NEXT: ret ptr [[RES]] +; + %res = call nonnull ptr %fn() + ret ptr %res +} + attributes #0 = { null_pointer_is_valid } attributes #1 = { nounwind willreturn} diff --git a/llvm/test/Transforms/FunctionAttrs/norecurse.ll b/llvm/test/Transforms/FunctionAttrs/norecurse.ll index 7a089f6..5cb8ac0 100644 --- 
a/llvm/test/Transforms/FunctionAttrs/norecurse.ll +++ b/llvm/test/Transforms/FunctionAttrs/norecurse.ll @@ -241,6 +241,37 @@ define void @r() norecurse { call void @q() ret void } + +define void @unknown_call(ptr %fn) { +; FNATTRS-LABEL: define {{[^@]+}}@unknown_call +; FNATTRS-SAME: (ptr readonly captures(none) [[FN:%.*]]) { +; FNATTRS-NEXT: call void [[FN]]() +; FNATTRS-NEXT: ret void +; +; ATTRIBUTOR-LABEL: define {{[^@]+}}@unknown_call +; ATTRIBUTOR-SAME: (ptr nofree nonnull captures(none) [[FN:%.*]]) { +; ATTRIBUTOR-NEXT: call void [[FN]]() +; ATTRIBUTOR-NEXT: ret void +; + call void %fn() + ret void +} + +define void @unknown_norecurse_call(ptr %fn) { +; FNATTRS-LABEL: define {{[^@]+}}@unknown_norecurse_call +; FNATTRS-SAME: (ptr readonly captures(none) [[FN:%.*]]) { +; FNATTRS-NEXT: call void [[FN]]() #[[ATTR7:[0-9]+]] +; FNATTRS-NEXT: ret void +; +; ATTRIBUTOR-LABEL: define {{[^@]+}}@unknown_norecurse_call +; ATTRIBUTOR-SAME: (ptr nofree nonnull captures(none) [[FN:%.*]]) { +; ATTRIBUTOR-NEXT: call void [[FN]]() #[[ATTR9:[0-9]+]] +; ATTRIBUTOR-NEXT: ret void +; + call void %fn() norecurse + ret void +} + ;. ; FNATTRS: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } ; FNATTRS: attributes #[[ATTR1]] = { nofree nosync nounwind memory(none) } @@ -249,6 +280,7 @@ define void @r() norecurse { ; FNATTRS: attributes #[[ATTR4]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) } ; FNATTRS: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } ; FNATTRS: attributes #[[ATTR6]] = { nofree norecurse nosync memory(none) } +; FNATTRS: attributes #[[ATTR7]] = { norecurse } ;. ; ATTRIBUTOR: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } ; ATTRIBUTOR: attributes #[[ATTR1]] = { nofree nosync nounwind memory(none) } @@ -259,6 +291,7 @@ define void @r() norecurse { ; ATTRIBUTOR: attributes #[[ATTR6]] = { norecurse nosync memory(none) } ; ATTRIBUTOR: attributes #[[ATTR7]] = { nosync } ; ATTRIBUTOR: attributes #[[ATTR8]] = { nofree willreturn } +; ATTRIBUTOR: attributes #[[ATTR9]] = { norecurse } ;. ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; COMMON: {{.*}} diff --git a/llvm/test/Transforms/FunctionAttrs/nounwind.ll b/llvm/test/Transforms/FunctionAttrs/nounwind.ll index afa9ae3..a64d9a6 100644 --- a/llvm/test/Transforms/FunctionAttrs/nounwind.ll +++ b/llvm/test/Transforms/FunctionAttrs/nounwind.ll @@ -4,10 +4,15 @@ ; TEST 1 define i32 @foo1() { -; COMMON: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; COMMON-LABEL: define {{[^@]+}}@foo1 -; COMMON-SAME: () #[[ATTR0:[0-9]+]] { -; COMMON-NEXT: ret i32 1 +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; FNATTRS-LABEL: define {{[^@]+}}@foo1 +; FNATTRS-SAME: () #[[ATTR0:[0-9]+]] { +; FNATTRS-NEXT: ret i32 1 +; +; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) +; ATTRIBUTOR-LABEL: define {{[^@]+}}@foo1 +; ATTRIBUTOR-SAME: () #[[ATTR0:[0-9]+]] { +; ATTRIBUTOR-NEXT: ret i32 1 ; ret i32 1 } @@ -70,14 +75,23 @@ define void @call_non_nounwind(){ ; } define i32 @maybe_throw(i1 zeroext %0) { -; COMMON-LABEL: define {{[^@]+}}@maybe_throw -; COMMON-SAME: (i1 zeroext [[TMP0:%.*]]) { -; COMMON-NEXT: br i1 [[TMP0]], label [[TMP2:%.*]], label [[TMP3:%.*]] -; COMMON: 2: -; COMMON-NEXT: tail call void @__cxa_rethrow() -; COMMON-NEXT: unreachable -; COMMON: 3: -; COMMON-NEXT: ret i32 -1 +; FNATTRS-LABEL: define {{[^@]+}}@maybe_throw +; FNATTRS-SAME: (i1 zeroext [[TMP0:%.*]]) { +; FNATTRS-NEXT: br i1 [[TMP0]], label [[TMP2:%.*]], label [[TMP3:%.*]] +; FNATTRS: 2: +; FNATTRS-NEXT: tail call void @__cxa_rethrow() +; FNATTRS-NEXT: unreachable +; FNATTRS: 3: +; FNATTRS-NEXT: ret i32 -1 +; +; ATTRIBUTOR-LABEL: define {{[^@]+}}@maybe_throw +; ATTRIBUTOR-SAME: (i1 zeroext [[TMP0:%.*]]) { +; ATTRIBUTOR-NEXT: br i1 [[TMP0]], label [[TMP2:%.*]], label [[TMP3:%.*]] +; ATTRIBUTOR: 2: +; ATTRIBUTOR-NEXT: tail call void @__cxa_rethrow() +; ATTRIBUTOR-NEXT: unreachable +; ATTRIBUTOR: 3: +; ATTRIBUTOR-NEXT: ret i32 -1 ; br i1 %0, label %2, label %3 @@ -101,18 +115,31 @@ declare void @__cxa_rethrow() ; } define i32 @catch_thing() personality ptr @__gxx_personality_v0 { -; COMMON-LABEL: define {{[^@]+}}@catch_thing() personality ptr @__gxx_personality_v0 { -; COMMON-NEXT: invoke void @__cxa_rethrow() -; COMMON-NEXT: to label [[TMP1:%.*]] unwind label [[TMP2:%.*]] -; COMMON: 1: -; COMMON-NEXT: unreachable -; COMMON: 2: -; COMMON-NEXT: [[TMP3:%.*]] = landingpad { ptr, i32 } -; COMMON-NEXT: catch ptr null -; COMMON-NEXT: [[TMP4:%.*]] = extractvalue { ptr, i32 } [[TMP3]], 0 -; COMMON-NEXT: [[TMP5:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP4]]) -; COMMON-NEXT: tail call void @__cxa_end_catch() -; COMMON-NEXT: ret i32 -1 +; FNATTRS-LABEL: define {{[^@]+}}@catch_thing() personality ptr @__gxx_personality_v0 { +; FNATTRS-NEXT: invoke void @__cxa_rethrow() +; FNATTRS-NEXT: to label [[TMP1:%.*]] unwind label [[TMP2:%.*]] +; FNATTRS: 1: +; FNATTRS-NEXT: unreachable +; FNATTRS: 2: +; FNATTRS-NEXT: [[TMP3:%.*]] = landingpad { ptr, i32 } +; FNATTRS-NEXT: catch ptr null +; FNATTRS-NEXT: [[TMP4:%.*]] = extractvalue { ptr, i32 } [[TMP3]], 0 +; FNATTRS-NEXT: [[TMP5:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP4]]) +; FNATTRS-NEXT: tail call void @__cxa_end_catch() +; FNATTRS-NEXT: ret i32 -1 +; +; ATTRIBUTOR-LABEL: define {{[^@]+}}@catch_thing() personality ptr @__gxx_personality_v0 { +; ATTRIBUTOR-NEXT: invoke void @__cxa_rethrow() +; ATTRIBUTOR-NEXT: to label [[TMP1:%.*]] unwind label [[TMP2:%.*]] +; ATTRIBUTOR: 1: +; ATTRIBUTOR-NEXT: unreachable 
+; ATTRIBUTOR: 2: +; ATTRIBUTOR-NEXT: [[TMP3:%.*]] = landingpad { ptr, i32 } +; ATTRIBUTOR-NEXT: catch ptr null +; ATTRIBUTOR-NEXT: [[TMP4:%.*]] = extractvalue { ptr, i32 } [[TMP3]], 0 +; ATTRIBUTOR-NEXT: [[TMP5:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP4]]) +; ATTRIBUTOR-NEXT: tail call void @__cxa_end_catch() +; ATTRIBUTOR-NEXT: ret i32 -1 ; invoke void @__cxa_rethrow() #1 to label %1 unwind label %2 @@ -130,9 +157,13 @@ define i32 @catch_thing() personality ptr @__gxx_personality_v0 { } define i32 @catch_thing_user() { -; COMMON-LABEL: define {{[^@]+}}@catch_thing_user() { -; COMMON-NEXT: [[CATCH_THING_CALL:%.*]] = call i32 @catch_thing() -; COMMON-NEXT: ret i32 [[CATCH_THING_CALL]] +; FNATTRS-LABEL: define {{[^@]+}}@catch_thing_user() { +; FNATTRS-NEXT: [[CATCH_THING_CALL:%.*]] = call i32 @catch_thing() +; FNATTRS-NEXT: ret i32 [[CATCH_THING_CALL]] +; +; ATTRIBUTOR-LABEL: define {{[^@]+}}@catch_thing_user() { +; ATTRIBUTOR-NEXT: [[CATCH_THING_CALL:%.*]] = call i32 @catch_thing() +; ATTRIBUTOR-NEXT: ret i32 [[CATCH_THING_CALL]] ; %catch_thing_call = call i32 @catch_thing() ret i32 %catch_thing_call @@ -147,10 +178,10 @@ define void @catch_specific_landingpad() personality ptr @__gxx_personality_v0 { ; COMMON-LABEL: define {{[^@]+}}@catch_specific_landingpad ; COMMON-SAME: () #[[ATTR3:[0-9]+]] personality ptr @__gxx_personality_v0 { ; COMMON-NEXT: invoke void @do_throw() -; COMMON-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[LPAD:%.*]] +; COMMON-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[LPAD:%.*]] ; COMMON: lpad: ; COMMON-NEXT: [[LP:%.*]] = landingpad { ptr, i32 } -; COMMON-NEXT: catch ptr @catch_ty +; COMMON-NEXT: catch ptr @catch_ty ; COMMON-NEXT: call void @abort() ; COMMON-NEXT: unreachable ; COMMON: unreachable: @@ -174,10 +205,10 @@ define void @catch_all_landingpad() personality ptr @__gxx_personality_v0 { ; COMMON-LABEL: define {{[^@]+}}@catch_all_landingpad ; COMMON-SAME: () #[[ATTR4:[0-9]+]] personality ptr @__gxx_personality_v0 { ; COMMON-NEXT: invoke void @do_throw() -; COMMON-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[LPAD:%.*]] +; COMMON-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[LPAD:%.*]] ; COMMON: lpad: ; COMMON-NEXT: [[LP:%.*]] = landingpad { ptr, i32 } -; COMMON-NEXT: catch ptr null +; COMMON-NEXT: catch ptr null ; COMMON-NEXT: call void @abort() ; COMMON-NEXT: unreachable ; COMMON: unreachable: @@ -201,10 +232,10 @@ define void @filter_specific_landingpad() personality ptr @__gxx_personality_v0 ; COMMON-LABEL: define {{[^@]+}}@filter_specific_landingpad ; COMMON-SAME: () #[[ATTR3]] personality ptr @__gxx_personality_v0 { ; COMMON-NEXT: invoke void @do_throw() -; COMMON-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[LPAD:%.*]] +; COMMON-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[LPAD:%.*]] ; COMMON: lpad: ; COMMON-NEXT: [[LP:%.*]] = landingpad { ptr, i32 } -; COMMON-NEXT: filter [1 x ptr] [ptr @catch_ty] +; COMMON-NEXT: filter [1 x ptr] [ptr @catch_ty] ; COMMON-NEXT: call void @abort() ; COMMON-NEXT: unreachable ; COMMON: unreachable: @@ -228,10 +259,10 @@ define void @filter_none_landingpad() personality ptr @__gxx_personality_v0 { ; COMMON-LABEL: define {{[^@]+}}@filter_none_landingpad ; COMMON-SAME: () #[[ATTR4]] personality ptr @__gxx_personality_v0 { ; COMMON-NEXT: invoke void @do_throw() -; COMMON-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[LPAD:%.*]] +; COMMON-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[LPAD:%.*]] ; COMMON: lpad: ; COMMON-NEXT: [[LP:%.*]] = landingpad { ptr, i32 } -; COMMON-NEXT: filter [0 
x ptr] zeroinitializer +; COMMON-NEXT: filter [0 x ptr] zeroinitializer ; COMMON-NEXT: call void @abort() ; COMMON-NEXT: unreachable ; COMMON: unreachable: @@ -255,10 +286,10 @@ define void @cleanup_landingpad() personality ptr @__gxx_personality_v0 { ; COMMON-LABEL: define {{[^@]+}}@cleanup_landingpad ; COMMON-SAME: () #[[ATTR3]] personality ptr @__gxx_personality_v0 { ; COMMON-NEXT: invoke void @do_throw() -; COMMON-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[LPAD:%.*]] +; COMMON-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[LPAD:%.*]] ; COMMON: lpad: ; COMMON-NEXT: [[LP:%.*]] = landingpad { ptr, i32 } -; COMMON-NEXT: cleanup +; COMMON-NEXT: cleanup ; COMMON-NEXT: call void @abort() ; COMMON-NEXT: unreachable ; COMMON: unreachable: @@ -282,7 +313,7 @@ define void @cleanuppad() personality ptr @__gxx_personality_v0 { ; FNATTRS-LABEL: define {{[^@]+}}@cleanuppad ; FNATTRS-SAME: () #[[ATTR3]] personality ptr @__gxx_personality_v0 { ; FNATTRS-NEXT: invoke void @do_throw() -; FNATTRS-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[CPAD:%.*]] +; FNATTRS-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[CPAD:%.*]] ; FNATTRS: cpad: ; FNATTRS-NEXT: [[CP:%.*]] = cleanuppad within none [] ; FNATTRS-NEXT: call void @abort() @@ -294,7 +325,7 @@ define void @cleanuppad() personality ptr @__gxx_personality_v0 { ; ATTRIBUTOR-LABEL: define {{[^@]+}}@cleanuppad ; ATTRIBUTOR-SAME: () #[[ATTR4]] personality ptr @__gxx_personality_v0 { ; ATTRIBUTOR-NEXT: invoke void @do_throw() -; ATTRIBUTOR-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[CPAD:%.*]] +; ATTRIBUTOR-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[CPAD:%.*]] ; ATTRIBUTOR: cpad: ; ATTRIBUTOR-NEXT: [[CP:%.*]] = cleanuppad within none [] ; ATTRIBUTOR-NEXT: call void @abort() @@ -319,7 +350,7 @@ define void @catchswitch_cleanuppad() personality ptr @__gxx_personality_v0 { ; FNATTRS-LABEL: define {{[^@]+}}@catchswitch_cleanuppad ; FNATTRS-SAME: () #[[ATTR3]] personality ptr @__gxx_personality_v0 { ; FNATTRS-NEXT: invoke void @do_throw() -; FNATTRS-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[CS:%.*]] +; FNATTRS-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[CS:%.*]] ; FNATTRS: cs: ; FNATTRS-NEXT: [[TOK:%.*]] = catchswitch within none [label %catch] unwind label [[CPAD:%.*]] ; FNATTRS: catch: @@ -337,7 +368,7 @@ define void @catchswitch_cleanuppad() personality ptr @__gxx_personality_v0 { ; ATTRIBUTOR-LABEL: define {{[^@]+}}@catchswitch_cleanuppad ; ATTRIBUTOR-SAME: () #[[ATTR4]] personality ptr @__gxx_personality_v0 { ; ATTRIBUTOR-NEXT: invoke void @do_throw() -; ATTRIBUTOR-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[CS:%.*]] +; ATTRIBUTOR-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[CS:%.*]] ; ATTRIBUTOR: cs: ; ATTRIBUTOR-NEXT: [[TOK:%.*]] = catchswitch within none [label %catch] unwind label [[CPAD:%.*]] ; ATTRIBUTOR: catch: @@ -371,6 +402,37 @@ unreachable: unreachable } +define void @unknown_call(ptr %fn) { +; FNATTRS-LABEL: define {{[^@]+}}@unknown_call +; FNATTRS-SAME: (ptr readonly captures(none) [[FN:%.*]]) { +; FNATTRS-NEXT: call void [[FN]]() +; FNATTRS-NEXT: ret void +; +; ATTRIBUTOR-LABEL: define {{[^@]+}}@unknown_call +; ATTRIBUTOR-SAME: (ptr nofree nonnull captures(none) [[FN:%.*]]) { +; ATTRIBUTOR-NEXT: call void [[FN]]() +; ATTRIBUTOR-NEXT: ret void +; + call void %fn() + ret void +} + +define void @unknown_nounwind_call(ptr %fn) { +; FNATTRS-LABEL: define {{[^@]+}}@unknown_nounwind_call +; FNATTRS-SAME: (ptr readonly captures(none) [[FN:%.*]]) { +; FNATTRS-NEXT: call void [[FN]]() #[[ATTR2:[0-9]+]] 
+; FNATTRS-NEXT: ret void +; +; ATTRIBUTOR: Function Attrs: nounwind +; ATTRIBUTOR-LABEL: define {{[^@]+}}@unknown_nounwind_call +; ATTRIBUTOR-SAME: (ptr nofree nonnull captures(none) [[FN:%.*]]) #[[ATTR2:[0-9]+]] { +; ATTRIBUTOR-NEXT: call void [[FN]]() #[[ATTR2]] +; ATTRIBUTOR-NEXT: ret void +; + call void %fn() nounwind + ret void +} + declare i32 @__gxx_personality_v0(...) declare ptr @__cxa_begin_catch(ptr) diff --git a/llvm/test/Transforms/FunctionAttrs/sendmsg-nocallback.ll b/llvm/test/Transforms/FunctionAttrs/sendmsg-nocallback.ll index 4d5db32..04575e4 100644 --- a/llvm/test/Transforms/FunctionAttrs/sendmsg-nocallback.ll +++ b/llvm/test/Transforms/FunctionAttrs/sendmsg-nocallback.ll @@ -50,10 +50,12 @@ define internal i32 @sendmsg_rtn_is_norecurse() { } define void @user() { -; FNATTRS-LABEL: define void @user() { +; FNATTRS: Function Attrs: norecurse nounwind +; FNATTRS-LABEL: define void @user( +; FNATTRS-SAME: ) #[[ATTR1]] { ; FNATTRS-NEXT: call void @sendmsg_is_norecurse() ; FNATTRS-NEXT: call void @sendmsghalt_is_norecurse() -; FNATTRS-NEXT: call void @sendmsg_rtn_is_norecurse() +; FNATTRS-NEXT: [[TMP1:%.*]] = call i32 @sendmsg_rtn_is_norecurse() ; FNATTRS-NEXT: ret void ; ; ATTRIBUTOR: Function Attrs: norecurse nounwind @@ -61,12 +63,12 @@ define void @user() { ; ATTRIBUTOR-SAME: ) #[[ATTR1]] { ; ATTRIBUTOR-NEXT: call void @sendmsg_is_norecurse() #[[ATTR5:[0-9]+]] ; ATTRIBUTOR-NEXT: call void @sendmsghalt_is_norecurse() #[[ATTR6:[0-9]+]] -; ATTRIBUTOR-NEXT: call void @sendmsg_rtn_is_norecurse() #[[ATTR6]] +; ATTRIBUTOR-NEXT: [[TMP1:%.*]] = call i32 @sendmsg_rtn_is_norecurse() #[[ATTR6]] ; ATTRIBUTOR-NEXT: ret void ; call void @sendmsg_is_norecurse() call void @sendmsghalt_is_norecurse() - call void @sendmsg_rtn_is_norecurse() + call i32 @sendmsg_rtn_is_norecurse() ret void } ;. 
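The two unknown_call tests added above hinge on call-site attributes: a call through an arbitrary function pointer may unwind, so the caller can only be inferred nounwind when every call site is known not to unwind, which is what the ATTRIBUTOR run demonstrates for unknown_nounwind_call. A minimal sketch of the distinction, with illustrative function names not taken from the test:

define void @may_unwind(ptr %fn) {
  ; No call-site attribute: the unknown callee may unwind, so
  ; nounwind cannot be inferred for @may_unwind.
  call void %fn()
  ret void
}

define void @never_unwinds(ptr %fn) {
  ; The nounwind call-site attribute promises this call does not
  ; unwind, so @never_unwinds itself can be marked nounwind even
  ; though the callee is unknown.
  call void %fn() nounwind
  ret void
}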
diff --git a/llvm/test/Transforms/HipStdPar/math-fixup.ll b/llvm/test/Transforms/HipStdPar/math-fixup.ll new file mode 100644 index 0000000..2c4622c --- /dev/null +++ b/llvm/test/Transforms/HipStdPar/math-fixup.ll @@ -0,0 +1,548 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=hipstdpar-math-fixup %s | FileCheck %s + +define void @test_acos(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_acos( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_acos_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_acos_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.acos.f64(double %dbl) + %1 = call float @llvm.acos.f32(float %flt) + ret void +} + +define void @test_acosh(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_acosh( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_acosh_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_acosh_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @acosh(double %dbl) + %1 = call float @acoshf(float %flt) + ret void +} + +define void @test_asin(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_asin( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_asin_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_asin_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.asin.f64(double %dbl) + %1 = call float @llvm.asin.f32(float %flt) + ret void +} + +define void @test_asinh(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_asinh( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_asinh_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_asinh_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @asinh(double %dbl) + %1 = call float @asinhf(float %flt) + ret void +} + +define void @test_atan(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_atan( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_atan_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_atan_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.atan.f64(double %dbl) + %1 = call float @llvm.atan.f32(float %flt) + ret void +} + +define void @test_atanh(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_atanh( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_atanh_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_atanh_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @atanh(double %dbl) + %1 = call float @atanhf(float %flt) + ret void +} + +define void @test_atan2(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_atan2( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_atan2_f64(double [[DBL]], double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float 
@__hipstdpar_atan2_f32(float [[FLT]], float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.atan2.f64(double %dbl, double %dbl) + %1 = call float @llvm.atan2.f32(float %flt, float %flt) + ret void +} + +define void @test_cbrt(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_cbrt( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_cbrt_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_cbrt_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @cbrt(double %dbl) + %1 = call float @cbrtf(float %flt) + ret void +} + +define void @test_cos(double %dbl) { +; CHECK-LABEL: define void @test_cos( +; CHECK-SAME: double [[DBL:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_cos_f64(double [[DBL]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.cos.f64(double %dbl) + ret void +} + +define void @test_cosh(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_cosh( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_cosh_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_cosh_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.cosh.f64(double %dbl) + %1 = call float @llvm.cosh.f32(float %flt) + ret void +} + +define void @test_erf(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_erf( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_erf_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_erf_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @erf(double %dbl) + %1 = call float @erff(float %flt) + ret void +} + +define void @test_erfc(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_erfc( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_erfc_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_erfc_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @erfc(double %dbl) + %1 = call float @erfcf(float %flt) + ret void +} + +define void @test_exp(double %dbl) { +; CHECK-LABEL: define void @test_exp( +; CHECK-SAME: double [[DBL:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_exp_f64(double [[DBL]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.exp.f64(double %dbl) + ret void +} + +define void @test_exp2(double %dbl) { +; CHECK-LABEL: define void @test_exp2( +; CHECK-SAME: double [[DBL:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_exp2_f64(double [[DBL]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.exp2.f64(double %dbl) + ret void +} + +define void @test_expm1(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_expm1( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_expm1_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_expm1_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @expm1(double %dbl) + %1 = call float @expm1f(float %flt) + ret void +} + +define void @test_fdim(double %dbl, float %flt) { +; 
CHECK-LABEL: define void @test_fdim( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_fdim_f64(double [[DBL]], double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_fdim_f32(float [[FLT]], float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @fdim(double %dbl, double %dbl) + %1 = call float @fdimf(float %flt, float %flt) + ret void +} + +define void @test_hypot(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_hypot( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_hypot_f64(double [[DBL]], double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_hypot_f32(float [[FLT]], float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @hypot(double %dbl, double %dbl) + %1 = call float @hypotf(float %flt, float %flt) + ret void +} + +define void @test_lgamma(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_lgamma( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_lgamma_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_lgamma_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @lgamma(double %dbl) + %1 = call float @lgammaf(float %flt) + ret void +} + +define void @test_log(double %dbl) { +; CHECK-LABEL: define void @test_log( +; CHECK-SAME: double [[DBL:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_log_f64(double [[DBL]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.log.f64(double %dbl) + ret void +} + +define void @test_log10(double %dbl) { +; CHECK-LABEL: define void @test_log10( +; CHECK-SAME: double [[DBL:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_log10_f64(double [[DBL]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.log10.f64(double %dbl) + ret void +} + +define void @test_log2(double %dbl) { +; CHECK-LABEL: define void @test_log2( +; CHECK-SAME: double [[DBL:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_log2_f64(double [[DBL]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.log2.f64(double %dbl) + ret void +} + +define void @test_log1p(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_log1p( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_log1p_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_log1p_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @log1p(double %dbl) + %1 = call float @log1pf(float %flt) + ret void +} + +define void @test_modf(double %dbl, float %flt, ptr %pdbl, ptr %pflt) { +; CHECK-LABEL: define void @test_modf( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]], ptr [[PDBL:%.*]], ptr [[PFLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = tail call { double, double } @__hipstdpar_modf_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { double, double } [[TMP0]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { double, double } [[TMP0]], 1 +; CHECK-NEXT: store double [[TMP2]], ptr [[PDBL]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = tail call { float, float } @__hipstdpar_modf_f32(float [[FLT]]) +; CHECK-NEXT: [[TMP4:%.*]] = 
extractvalue { float, float } [[TMP3]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { float, float } [[TMP3]], 1 +; CHECK-NEXT: store float [[TMP5]], ptr [[PFLT]], align 4 +; CHECK-NEXT: ret void +; +entry: + %0 = tail call { double, double } @llvm.modf.f64(double %dbl) + %1 = extractvalue { double, double } %0, 0 + %2 = extractvalue { double, double } %0, 1 + store double %2, ptr %pdbl, align 8 + %3 = tail call { float, float } @llvm.modf.f32(float %flt) + %4 = extractvalue { float, float } %3, 0 + %5 = extractvalue { float, float } %3, 1 + store float %5, ptr %pflt, align 4 + ret void +} + +define void @test_pow(double %dbl) { +; CHECK-LABEL: define void @test_pow( +; CHECK-SAME: double [[DBL:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_pow_f64(double [[DBL]], double [[DBL]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.pow.f64(double %dbl, double %dbl) + ret void +} + +define void @test_remainder(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_remainder( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_remainder_f64(double [[DBL]], double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_remainder_f32(float [[FLT]], float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @remainder(double %dbl, double %dbl) + %1 = call float @remainderf(float %flt, float %flt) + ret void +} + +define void @test_remquo(double %dbl, float %flt, ptr %p) { +; CHECK-LABEL: define void @test_remquo( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_remquo_f64(double [[DBL]], double [[DBL]], ptr [[P]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_remquo_f32(float [[FLT]], float [[FLT]], ptr [[P]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @remquo(double %dbl, double %dbl, ptr %p) + %1 = call float @remquof(float %flt, float %flt, ptr %p) + ret void +} + +define void @test_sin(double %dbl) { +; CHECK-LABEL: define void @test_sin( +; CHECK-SAME: double [[DBL:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_sin_f64(double [[DBL]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.sin.f64(double %dbl) + ret void +} + +define void @test_sinh(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_sinh( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_sinh_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_sinh_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.sinh.f64(double %dbl) + %1 = call float @llvm.sinh.f32(float %flt) + ret void +} + +define void @test_tan(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_tan( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_tan_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_tan_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.tan.f64(double %dbl) + %1 = call float @llvm.tan.f32(float %flt) + ret void +} + +define void @test_tanh(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_tanh( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] 
= call double @__hipstdpar_tanh_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_tanh_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @llvm.tanh.f64(double %dbl) + %1 = call float @llvm.tanh.f32(float %flt) + ret void +} + +define void @test_tgamma(double %dbl, float %flt) { +; CHECK-LABEL: define void @test_tgamma( +; CHECK-SAME: double [[DBL:%.*]], float [[FLT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_tgamma_f64(double [[DBL]]) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_tgamma_f32(float [[FLT]]) +; CHECK-NEXT: ret void +; +entry: + %0 = call double @tgamma(double %dbl) + %1 = call float @tgammaf(float %flt) + ret void +} + +@globdbl = global double 4.200000e+01 +@globflt = global float 4.200000e+01 + +define void @global_args() { +; CHECK-LABEL: define void @global_args() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[DBL:%.*]] = load double, ptr @globdbl, align 8 +; CHECK-NEXT: [[FLT:%.*]] = load float, ptr @globflt, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call double @__hipstdpar_remquo_f64(double [[DBL]], double [[DBL]], ptr @globdbl) +; CHECK-NEXT: [[TMP1:%.*]] = call float @__hipstdpar_remquo_f32(float [[FLT]], float [[FLT]], ptr @globflt) +; CHECK-NEXT: ret void +; +entry: + %dbl = load double, ptr @globdbl + %flt = load float, ptr @globflt + %1 = call double @remquo(double %dbl, double %dbl, ptr @globdbl) + %2 = call float @remquof(float %flt, float %flt, ptr @globflt) + ret void +} + +declare hidden double @remainder(double, double) + +declare hidden float @remainderf(float, float) + +declare hidden double @remquo(double, double, ptr) + +declare hidden float @remquof(float, float, ptr) + +declare hidden double @fdim(double, double) + +declare hidden float @fdimf(float, float) + +declare double @llvm.exp.f64(double) + +declare float @llvm.exp.f32(float) + +declare double @llvm.exp2.f64(double) + +declare float @llvm.exp2.f32(float) + +declare hidden double @expm1(double) + +declare hidden float @expm1f(float) + +declare double @llvm.log.f64(double) + +declare double @llvm.log10.f64(double) + +declare double @llvm.log2.f64(double) + +declare hidden double @log1p(double) + +declare hidden float @log1pf(float) + +declare { float, float } @llvm.modf.f32(float) + +declare { double, double } @llvm.modf.f64(double) + +declare double @llvm.pow.f64(double, double) + +declare hidden double @cbrt(double) + +declare hidden float @cbrtf(float) + +declare hidden double @hypot(double, double) + +declare hidden float @hypotf(float, float) + +declare double @llvm.sin.f64(double) + +declare double @llvm.cos.f64(double) + +declare double @llvm.tan.f64(double) + +declare double @llvm.asin.f64(double) + +declare double @llvm.acos.f64(double) + +declare double @llvm.atan.f64(double) + +declare double @llvm.atan2.f64(double, double) + +declare double @llvm.sinh.f64(double) + +declare double @llvm.cosh.f64(double) + +declare double @llvm.tanh.f64(double) + +declare hidden double @asinh(double) + +declare hidden float @asinhf(float) + +declare hidden double @acosh(double) + +declare hidden float @acoshf(float) + +declare hidden double @atanh(double) + +declare hidden float @atanhf(float) + +declare hidden double @erf(double) + +declare hidden float @erff(float) + +declare hidden double @erfc(double) + +declare hidden float @erfcf(float) + +declare hidden double @tgamma(double) + +declare hidden float @tgammaf(float) + +declare hidden double @lgamma(double) + +declare hidden float @lgammaf(float) 
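The new math-fixup.ll test follows one pattern throughout: each supported libm call or LLVM math intrinsic is rewritten to a __hipstdpar_-prefixed wrapper carrying an _f64/_f32 type suffix. A minimal standalone repro, assuming only the pass name from the RUN line and the test_sin mapping shown above (the file and function names are illustrative):

; repro.ll -- run with: opt -S -passes=hipstdpar-math-fixup repro.ll
define double @sin_repro(double %x) {
entry:
  ; Expected to be rewritten to: call double @__hipstdpar_sin_f64(double %x)
  %r = call double @llvm.sin.f64(double %x)
  ret double %r
}

declare double @llvm.sin.f64(double)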
diff --git a/llvm/test/Transforms/InstCombine/trunc-inseltpoison.ll b/llvm/test/Transforms/InstCombine/trunc-inseltpoison.ll index 33fa2c3..f83352c 100644 --- a/llvm/test/Transforms/InstCombine/trunc-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/trunc-inseltpoison.ll @@ -959,8 +959,8 @@ define <3 x i31> @wide_splat3(<3 x i33> %x) { define <8 x i8> @wide_lengthening_splat(<4 x i16> %v) { ; CHECK-LABEL: @wide_lengthening_splat( -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TR:%.*]] = trunc <8 x i16> [[SHUF]] to <8 x i8> +; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i16> [[V:%.*]] to <4 x i8> +; CHECK-NEXT: [[TR:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: ret <8 x i8> [[TR]] ; %shuf = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer diff --git a/llvm/test/Transforms/InstCombine/trunc.ll b/llvm/test/Transforms/InstCombine/trunc.ll index a85ce71..dfe9d94 100644 --- a/llvm/test/Transforms/InstCombine/trunc.ll +++ b/llvm/test/Transforms/InstCombine/trunc.ll @@ -960,8 +960,8 @@ define <3 x i31> @wide_splat3(<3 x i33> %x) { define <8 x i8> @wide_lengthening_splat(<4 x i16> %v) { ; CHECK-LABEL: @wide_lengthening_splat( -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TR:%.*]] = trunc <8 x i16> [[SHUF]] to <8 x i8> +; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i16> [[V:%.*]] to <4 x i8> +; CHECK-NEXT: [[TR:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: ret <8 x i8> [[TR]] ; %shuf = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer @@ -969,6 +969,19 @@ define <8 x i8> @wide_lengthening_splat(<4 x i16> %v) { ret <8 x i8> %tr } +; This is a negative test; we expect the trunc to remain after the shuffle, as it +; might not be beneficial to perform the trunc on a wider type +define <4 x i8> @wide_shortening_splat(<8 x i16> %v) { +; CHECK-LABEL: @wide_shortening_splat( +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TR:%.*]] = trunc <4 x i16> [[SHUF]] to <4 x i8> +; CHECK-NEXT: ret <4 x i8> [[TR]] +; + %shuf = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer + %tr = trunc <4 x i16> %shuf to <4 x i8> + ret <4 x i8> %tr +} + define <2 x i8> @narrow_add_vec_constant(<2 x i32> %x) { ; CHECK-LABEL: @narrow_add_vec_constant( ; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i32> [[X:%.*]] to <2 x i8> diff --git a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll index 672e949..b505917 100644 --- a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll @@ -874,3 +874,79 @@ define void @load_factor2_fp128(ptr %ptr) { %v1 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> poison, <2 x i32> <i32 1, i32 3> ret void } + +define void @load_factor2_f32(ptr %ptr) { +; RV32-LABEL: @load_factor2_f32( +; RV32-NEXT: [[TMP1:%.*]] = call { <8 x float>, <8 x float> } @llvm.riscv.seg2.load.mask.v8f32.p0.i32(ptr [[PTR:%.*]], <8 x i1> splat (i1 true), i32 8) +; RV32-NEXT: [[TMP2:%.*]] = extractvalue { <8 x float>, <8 x float> } [[TMP1]], 1 +; RV32-NEXT: [[TMP3:%.*]] = extractvalue { <8 x float>, <8 x float> } [[TMP1]], 0 +; RV32-NEXT: ret void +; +; RV64-LABEL:
@load_factor2_f32( +; RV64-NEXT: [[TMP1:%.*]] = call { <8 x float>, <8 x float> } @llvm.riscv.seg2.load.mask.v8f32.p0.i64(ptr [[PTR:%.*]], <8 x i1> splat (i1 true), i64 8) +; RV64-NEXT: [[TMP2:%.*]] = extractvalue { <8 x float>, <8 x float> } [[TMP1]], 1 +; RV64-NEXT: [[TMP3:%.*]] = extractvalue { <8 x float>, <8 x float> } [[TMP1]], 0 +; RV64-NEXT: ret void +; + %interleaved.vec = load <16 x float>, ptr %ptr + %v0 = shufflevector <16 x float> %interleaved.vec, <16 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %v1 = shufflevector <16 x float> %interleaved.vec, <16 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + ret void +} + +define void @load_factor2_f64(ptr %ptr) { +; RV32-LABEL: @load_factor2_f64( +; RV32-NEXT: [[TMP1:%.*]] = call { <8 x double>, <8 x double> } @llvm.riscv.seg2.load.mask.v8f64.p0.i32(ptr [[PTR:%.*]], <8 x i1> splat (i1 true), i32 8) +; RV32-NEXT: [[TMP2:%.*]] = extractvalue { <8 x double>, <8 x double> } [[TMP1]], 1 +; RV32-NEXT: [[TMP3:%.*]] = extractvalue { <8 x double>, <8 x double> } [[TMP1]], 0 +; RV32-NEXT: ret void +; +; RV64-LABEL: @load_factor2_f64( +; RV64-NEXT: [[TMP1:%.*]] = call { <8 x double>, <8 x double> } @llvm.riscv.seg2.load.mask.v8f64.p0.i64(ptr [[PTR:%.*]], <8 x i1> splat (i1 true), i64 8) +; RV64-NEXT: [[TMP2:%.*]] = extractvalue { <8 x double>, <8 x double> } [[TMP1]], 1 +; RV64-NEXT: [[TMP3:%.*]] = extractvalue { <8 x double>, <8 x double> } [[TMP1]], 0 +; RV64-NEXT: ret void +; + %interleaved.vec = load <16 x double>, ptr %ptr + %v0 = shufflevector <16 x double> %interleaved.vec, <16 x double> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %v1 = shufflevector <16 x double> %interleaved.vec, <16 x double> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + ret void +} + +define void @load_factor2_bf16(ptr %ptr) { +; RV32-LABEL: @load_factor2_bf16( +; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = load <16 x bfloat>, ptr [[PTR:%.*]], align 32 +; RV32-NEXT: [[V0:%.*]] = shufflevector <16 x bfloat> [[INTERLEAVED_VEC]], <16 x bfloat> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> +; RV32-NEXT: [[V1:%.*]] = shufflevector <16 x bfloat> [[INTERLEAVED_VEC]], <16 x bfloat> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> +; RV32-NEXT: ret void +; +; RV64-LABEL: @load_factor2_bf16( +; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = load <16 x bfloat>, ptr [[PTR:%.*]], align 32 +; RV64-NEXT: [[V0:%.*]] = shufflevector <16 x bfloat> [[INTERLEAVED_VEC]], <16 x bfloat> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> +; RV64-NEXT: [[V1:%.*]] = shufflevector <16 x bfloat> [[INTERLEAVED_VEC]], <16 x bfloat> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> +; RV64-NEXT: ret void +; + %interleaved.vec = load <16 x bfloat>, ptr %ptr + %v0 = shufflevector <16 x bfloat> %interleaved.vec, <16 x bfloat> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %v1 = shufflevector <16 x bfloat> %interleaved.vec, <16 x bfloat> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + ret void +} + +define void @load_factor2_f16(ptr %ptr) { +; RV32-LABEL: @load_factor2_f16( +; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = load <16 x half>, ptr [[PTR:%.*]], align 32 +; RV32-NEXT: [[V0:%.*]] = shufflevector <16 x half> [[INTERLEAVED_VEC]], <16 x half> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 
8, i32 10, i32 12, i32 14> +; RV32-NEXT: [[V1:%.*]] = shufflevector <16 x half> [[INTERLEAVED_VEC]], <16 x half> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> +; RV32-NEXT: ret void +; +; RV64-LABEL: @load_factor2_f16( +; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = load <16 x half>, ptr [[PTR:%.*]], align 32 +; RV64-NEXT: [[V0:%.*]] = shufflevector <16 x half> [[INTERLEAVED_VEC]], <16 x half> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> +; RV64-NEXT: [[V1:%.*]] = shufflevector <16 x half> [[INTERLEAVED_VEC]], <16 x half> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> +; RV64-NEXT: ret void +; + %interleaved.vec = load <16 x half>, ptr %ptr + %v0 = shufflevector <16 x half> %interleaved.vec, <16 x half> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %v1 = shufflevector <16 x half> %interleaved.vec, <16 x half> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + ret void +} diff --git a/llvm/test/Transforms/LoopIdiom/reuse-lcssa-phi-scev-expansion.ll b/llvm/test/Transforms/LoopIdiom/reuse-lcssa-phi-scev-expansion.ll index 357a7b6..65aaf72 100644 --- a/llvm/test/Transforms/LoopIdiom/reuse-lcssa-phi-scev-expansion.ll +++ b/llvm/test/Transforms/LoopIdiom/reuse-lcssa-phi-scev-expansion.ll @@ -96,3 +96,68 @@ loop.3: exit: ret void } + +declare i1 @cond() + +define ptr @test_lcssa_reuse_preserve_lcssa() { +; CHECK-LABEL: define ptr @test_lcssa_reuse_preserve_lcssa() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[LOOP_0_HEADER:.*]] +; CHECK: [[LOOP_0_HEADER]]: +; CHECK-NEXT: br label %[[LOOP_1:.*]] +; CHECK: [[LOOP_1]]: +; CHECK-NEXT: [[IV_1:%.*]] = phi ptr [ null, %[[LOOP_0_HEADER]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP_1]] ] +; CHECK-NEXT: [[IV_1_NEXT]] = getelementptr i8, ptr [[IV_1]], i64 1 +; CHECK-NEXT: [[EC_1:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[EC_1]], label %[[THEN:.*]], label %[[LOOP_1]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[IV_1_LCSSA1:%.*]] = phi ptr [ [[IV_1]], %[[LOOP_1]] ] +; CHECK-NEXT: [[C_2:%.*]] = call i1 @cond() +; CHECK-NEXT: br i1 [[C_2]], label %[[LOOP_2_PREHEADER:.*]], label %[[LOOP_0_LATCH:.*]] +; CHECK: [[LOOP_2_PREHEADER]]: +; CHECK-NEXT: [[IV_1_LCSSA:%.*]] = phi ptr [ [[IV_1_LCSSA1]], %[[THEN]] ] +; CHECK-NEXT: [[IV_1_LCSSA_LCSSA:%.*]] = phi ptr [ [[IV_1_LCSSA1]], %[[THEN]] ] +; CHECK-NEXT: [[STRLEN:%.*]] = call i64 @strlen(ptr null) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[IV_1_LCSSA]], i64 1 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[TMP0]], i64 [[STRLEN]] +; CHECK-NEXT: br label %[[LOOP_2:.*]] +; CHECK: [[LOOP_2]]: +; CHECK-NEXT: [[IV_2:%.*]] = phi ptr [ [[RES:%.*]], %[[LOOP_2]] ], [ [[IV_1_LCSSA_LCSSA]], %[[LOOP_2_PREHEADER]] ] +; CHECK-NEXT: [[RES]] = getelementptr i8, ptr [[IV_2]], i64 1 +; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[IV_1_LCSSA_LCSSA]], align 1 +; CHECK-NEXT: [[EC_2:%.*]] = icmp eq i8 [[L]], 0 +; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[LOOP_2]] +; CHECK: [[LOOP_0_LATCH]]: +; CHECK-NEXT: br label %[[LOOP_0_HEADER]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret ptr [[SCEVGEP]] +; +entry: + br label %loop.0.header + +loop.0.header: + br label %loop.1 + +loop.1: + %iv.1 = phi ptr [ null, %loop.0.header ], [ %iv.1.next, %loop.1 ] + %iv.1.next = getelementptr i8, ptr %iv.1, i64 1 + %ec.1 = call i1 @cond() + br i1 %ec.1, label %then, label %loop.1 + +then: + %c.2 = call i1 @cond() + br i1 %c.2, label %loop.2, label %loop.0.latch + +loop.2: + %iv.2 = phi ptr [ %res, 
%loop.2 ], [ %iv.1, %then ] + %res = getelementptr i8, ptr %iv.2, i64 1 + %l = load i8, ptr %iv.1, align 1 + %ec.2 = icmp eq i8 %l, 0 + br i1 %ec.2, label %exit, label %loop.2 + +loop.0.latch: + br label %loop.0.header + +exit: + ret ptr %res +} diff --git a/llvm/test/Transforms/LoopUnroll/Hexagon/reuse-lcssa-phi-scev-expansion.ll b/llvm/test/Transforms/LoopUnroll/Hexagon/reuse-lcssa-phi-scev-expansion.ll new file mode 100644 index 0000000..f74fb14 --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/Hexagon/reuse-lcssa-phi-scev-expansion.ll @@ -0,0 +1,108 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-unroll -S %s | FileCheck %s + +target triple = "hexagon-unknown-linux" + +declare void @foo() + +define void @preserve_lcssa_when_reusing_existing_phi() { +; CHECK-LABEL: define void @preserve_lcssa_when_reusing_existing_phi() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[LOOP_1_HEADER:.*]] +; CHECK: [[LOOP_1_HEADER]]: +; CHECK-NEXT: br label %[[LOOP_2_HEADER:.*]] +; CHECK: [[LOOP_2_HEADER]]: +; CHECK-NEXT: br label %[[LOOP_3:.*]] +; CHECK: [[LOOP_3]]: +; CHECK-NEXT: [[IV_3:%.*]] = phi i32 [ [[IV_3_NEXT:%.*]], %[[LOOP_3]] ], [ 0, %[[LOOP_2_HEADER]] ] +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: [[IV_3_NEXT]] = add i32 [[IV_3]], 1 +; CHECK-NEXT: br i1 false, label %[[PH:.*]], label %[[LOOP_3]] +; CHECK: [[PH]]: +; CHECK-NEXT: [[IV_3_LCSSA:%.*]] = phi i32 [ [[IV_3]], %[[LOOP_3]] ] +; CHECK-NEXT: br i1 true, label %[[LOOP_2_LATCH:.*]], label %[[LOOP_4_PREHEADER:.*]] +; CHECK: [[LOOP_4_PREHEADER]]: +; CHECK-NEXT: [[IV_3_LCSSA_LCSSA1:%.*]] = phi i32 [ [[IV_3_LCSSA]], %[[PH]] ] +; CHECK-NEXT: [[IV_3_LCSSA_LCSSA:%.*]] = phi i32 [ [[IV_3_LCSSA]], %[[PH]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[IV_3_LCSSA_LCSSA1]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = freeze i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], -1 +; CHECK-NEXT: [[XTRAITER:%.*]] = and i32 [[TMP1]], 7 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i32 [[TMP2]], 7 +; CHECK-NEXT: br i1 [[TMP3]], label %[[LOOP_1_LATCH_UNR_LCSSA:.*]], label %[[LOOP_4_PREHEADER_NEW:.*]] +; CHECK: [[LOOP_4_PREHEADER_NEW]]: +; CHECK-NEXT: br label %[[LOOP_4:.*]] +; CHECK: [[LOOP_2_LATCH]]: +; CHECK-NEXT: br label %[[LOOP_2_HEADER]] +; CHECK: [[LOOP_4]]: +; CHECK-NEXT: [[IV_4:%.*]] = phi i32 [ 0, %[[LOOP_4_PREHEADER_NEW]] ], [ [[INC_I_7:%.*]], %[[LOOP_4]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ 0, %[[LOOP_4_PREHEADER_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[LOOP_4]] ] +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: [[INC_I_7]] = add nuw nsw i32 [[IV_4]], 8 +; CHECK-NEXT: [[NITER_NEXT_7]] = add nuw nsw i32 [[NITER]], 8 +; CHECK-NEXT: br i1 true, label %[[LOOP_1_LATCH_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP_4]] +; CHECK: [[LOOP_1_LATCH_UNR_LCSSA_LOOPEXIT]]: +; CHECK-NEXT: [[IV_4_UNR_PH:%.*]] = phi i32 [ [[INC_I_7]], %[[LOOP_4]] ] +; CHECK-NEXT: br label %[[LOOP_1_LATCH_UNR_LCSSA]] +; CHECK: [[LOOP_1_LATCH_UNR_LCSSA]]: +; CHECK-NEXT: [[IV_4_UNR:%.*]] = phi i32 [ 0, %[[LOOP_4_PREHEADER]] ], [ [[IV_4_UNR_PH]], %[[LOOP_1_LATCH_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i32 [[XTRAITER]], 0 +; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_4_EPIL_PREHEADER:.*]], label %[[LOOP_1_LATCH:.*]] +; CHECK: [[LOOP_4_EPIL_PREHEADER]]: +; CHECK-NEXT: br label 
%[[LOOP_4_EPIL:.*]] +; CHECK: [[LOOP_4_EPIL]]: +; CHECK-NEXT: [[IV_4_EPIL:%.*]] = phi i32 [ [[INC_I_EPIL:%.*]], %[[LOOP_4_EPIL]] ], [ [[IV_4_UNR]], %[[LOOP_4_EPIL_PREHEADER]] ] +; CHECK-NEXT: [[EPIL_ITER:%.*]] = phi i32 [ 0, %[[LOOP_4_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[LOOP_4_EPIL]] ] +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: [[INC_I_EPIL]] = add i32 [[IV_4_EPIL]], 1 +; CHECK-NEXT: [[EC_EPIL:%.*]] = icmp eq i32 [[IV_4_EPIL]], [[IV_3_LCSSA_LCSSA]] +; CHECK-NEXT: [[EPIL_ITER_NEXT]] = add i32 [[EPIL_ITER]], 1 +; CHECK-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i32 [[EPIL_ITER_NEXT]], [[XTRAITER]] +; CHECK-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[LOOP_4_EPIL]], label %[[LOOP_1_LATCH_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[LOOP_1_LATCH_EPILOG_LCSSA]]: +; CHECK-NEXT: br label %[[LOOP_1_LATCH]] +; CHECK: [[LOOP_1_LATCH]]: +; CHECK-NEXT: br label %[[LOOP_1_HEADER]] +; +entry: + br label %loop.1.header + +loop.1.header: + br label %loop.2.header + +loop.2.header: + br label %loop.3 + +loop.3: + %iv.3 = phi i32 [ %iv.3.next, %loop.3 ], [ 0, %loop.2.header ] + call void @foo() + %iv.3.next = add i32 %iv.3, 1 + br i1 false, label %ph, label %loop.3 + +ph: + br i1 true, label %loop.2.latch, label %loop.4 + +loop.2.latch: + br label %loop.2.header + +loop.4: + %iv.4 = phi i32 [ 0, %ph ], [ %inc.i, %loop.4 ] + call void @foo() + %inc.i = add i32 %iv.4, 1 + %ec = icmp eq i32 %iv.4, %iv.3 + br i1 %ec, label %loop.1.latch, label %loop.4 + +loop.1.latch: + br label %loop.1.header +} +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.unroll.disable"} +;. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll index 18cc3a8..0f407cd 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll @@ -148,6 +148,104 @@ exit: ret void } +define void @main_vf_vscale_x_2_no_epi_iteration(ptr %A) #0 vscale_range(8, 8) { +; CHECK-LABEL: @main_vf_vscale_x_2_no_epi_iteration( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i64 [[TMP8]] +; CHECK-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP6]], align 1 +; CHECK-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP9]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; 
CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; +; CHECK-VF8-LABEL: @main_vf_vscale_x_2_no_epi_iteration( +; CHECK-VF8-NEXT: iter.check: +; CHECK-VF8-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK-VF8: vector.main.loop.iter.check: +; CHECK-VF8-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF8-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 +; CHECK-VF8-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-VF8-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF8: vector.ph: +; CHECK-VF8-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF8-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 +; CHECK-VF8-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-VF8-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-VF8-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF8-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 +; CHECK-VF8-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF8: vector.body: +; CHECK-VF8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF8-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-VF8-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF8-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2 +; CHECK-VF8-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i64 [[TMP8]] +; CHECK-VF8-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP6]], align 1 +; CHECK-VF8-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP9]], align 1 +; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-VF8-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF8-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF8: middle.block: +; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK-VF8: vec.epilog.iter.check: +; CHECK-VF8-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]] +; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-VF8-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-VF8: vec.epilog.ph: +; CHECK-VF8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-VF8-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK-VF8: vec.epilog.vector.body: +; CHECK-VF8-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-VF8-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX1]] +; CHECK-VF8-NEXT: store <8 x i64> splat (i64 1), ptr [[TMP11]], align 1 +; CHECK-VF8-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 8 +; CHECK-VF8-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 1024 +; CHECK-VF8-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-VF8: vec.epilog.middle.block: +; CHECK-VF8-NEXT: br i1 
true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-VF8: vec.epilog.scalar.ph: +; CHECK-VF8-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-VF8-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF8: for.body: +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds i64, ptr %A, i64 %iv + store i64 1, ptr %arrayidx, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp ne i64 %iv.next, 1024 + br i1 %exitcond, label %for.body, label %exit + +exit: + ret void +} ; DEBUG: LV: Checking a loop in 'main_vf_vscale_x_2' ; DEBUG: Create Skeleton for epilogue vectorized loop (first pass) @@ -163,20 +261,21 @@ exit: ; fixed-width VF=8 for the epilogue if the vectors are known to be ; sufficiently wide. This information can be deduced from vscale_range or ; VScaleForTuning (set by mcpu/mtune). -define void @main_vf_vscale_x_2(ptr %A) #0 vscale_range(8, 8) { +define void @main_vf_vscale_x_2(ptr %A, i64 %n) #0 vscale_range(8, 8) { ; CHECK-LABEL: @main_vf_vscale_x_2( ; CHECK-NEXT: iter.check: -; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -190,44 +289,48 @@ define void @main_vf_vscale_x_2(ptr %A) #0 vscale_range(8, 8) { ; CHECK-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP17]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]] +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 ; 
CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: -; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[SCALAR_PH]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[N]], 8 +; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX1]] -; CHECK-NEXT: store <8 x i64> splat (i64 1), ptr [[TMP20]], align 1 -; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 8 -; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 1024 -; CHECK-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX4]] +; CHECK-NEXT: store <8 x i64> splat (i64 1), ptr [[TMP19]], align 1 +; CHECK-NEXT: [[INDEX_NEXT5]] = add nuw i64 [[INDEX4]], 8 +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT5]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[TMP20]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[CMP_N6]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[SCALAR_PH]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY1:%.*]] ; CHECK: for.body: ; ; CHECK-VF8-LABEL: @main_vf_vscale_x_2( ; CHECK-VF8-NEXT: iter.check: -; CHECK-VF8-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK-VF8-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N:%.*]], 8 +; CHECK-VF8-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; CHECK-VF8: vector.main.loop.iter.check: ; CHECK-VF8-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-VF8-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; CHECK-VF8-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-VF8-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] ; CHECK-VF8-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-VF8: vector.ph: ; CHECK-VF8-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-VF8-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-VF8-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-VF8-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-VF8-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-VF8-NEXT: [[N_VEC:%.*]] = sub 
i64 [[N]], [[N_MOD_VF]] ; CHECK-VF8-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-VF8-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 ; CHECK-VF8-NEXT: br label [[VECTOR_BODY:%.*]] @@ -241,28 +344,31 @@ define void @main_vf_vscale_x_2(ptr %A) #0 vscale_range(8, 8) { ; CHECK-VF8-NEXT: store <vscale x 2 x i64> splat (i64 1), ptr [[TMP17]], align 1 ; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-VF8-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-VF8-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-VF8-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-VF8: middle.block: -; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK-VF8: vec.epilog.iter.check: -; CHECK-VF8-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]] +; CHECK-VF8-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] ; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 ; CHECK-VF8-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK-VF8: vec.epilog.ph: ; CHECK-VF8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-VF8-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[N]], 8 +; CHECK-VF8-NEXT: [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]] ; CHECK-VF8-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK-VF8: vec.epilog.vector.body: ; CHECK-VF8-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-VF8-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX1]] ; CHECK-VF8-NEXT: store <8 x i64> splat (i64 1), ptr [[TMP20]], align 1 ; CHECK-VF8-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 8 -; CHECK-VF8-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 1024 -; CHECK-VF8-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-VF8-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC3]] +; CHECK-VF8-NEXT: br i1 [[TMP19]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-VF8: vec.epilog.middle.block: -; CHECK-VF8-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-VF8-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] +; CHECK-VF8-NEXT: br i1 [[CMP_N6]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK-VF8: vec.epilog.scalar.ph: -; CHECK-VF8-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-VF8-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-VF8-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-VF8: for.body: ; @@ -274,7 +380,7 @@ for.body: %arrayidx = getelementptr inbounds i64, ptr %A, i64 %iv store i64 1, ptr %arrayidx, align 1 %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp ne i64 %iv.next, 1024 + %exitcond = icmp ne i64 %iv.next, %n br i1 %exitcond, label %for.body, label %exit exit: @@ -313,7 +419,7 @@ define void 
@test_pr57912_pointer_induction(ptr %start) #0 { ; CHECK-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP19]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] @@ -340,7 +446,7 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 { ; CHECK-NEXT: store <vscale x 8 x i8> zeroinitializer, ptr [[TMP28]], align 1 ; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX7]], [[TMP26]] ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 10000, [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N6]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -376,7 +482,7 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 { ; CHECK-VF8-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP17]], align 1 ; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-VF8-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-VF8-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-VF8-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-VF8: middle.block: ; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, [[N_VEC]] ; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] @@ -395,7 +501,7 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 { ; CHECK-VF8-NEXT: store <8 x i8> zeroinitializer, ptr [[TMP20]], align 1 ; CHECK-VF8-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX3]], 8 ; CHECK-VF8-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 10000 -; CHECK-VF8-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-VF8-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-VF8: vec.epilog.middle.block: ; CHECK-VF8-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK-VF8: vec.epilog.scalar.ph: @@ -420,4 +526,307 @@ exit: ret void } +; Loop with vscale-based trip count vscale x 1033. 
+define void @trip_count_vscale(ptr noalias %a, ptr noalias %b) vscale_range(1, 16) #0 { +; CHECK-LABEL: @trip_count_vscale( +; CHECK-NEXT: iter.check: +; CHECK-NEXT: [[V:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[N:%.*]] = mul nuw nsw i64 [[V]], 1033 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP3]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP5]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 8 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw float, ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 4 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i64 [[TMP11]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[TMP16]] +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP13]], align 4 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x float>, ptr [[TMP17]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD3]] +; CHECK-NEXT: [[TMP19:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD4]] +; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 4 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[TMP21]] +; CHECK-NEXT: store <vscale x 4 x float> [[TMP18]], ptr [[TMP13]], align 4 +; CHECK-NEXT: store <vscale x 4 x float> [[TMP19]], ptr [[TMP22]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] +; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP25:%.*]] = mul nuw i64 [[TMP24]], 2 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP25]] +; CHECK-NEXT: 
br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP27:%.*]] = mul nuw i64 [[TMP26]], 2 +; CHECK-NEXT: [[N_MOD_VF5:%.*]] = urem i64 [[N]], [[TMP27]] +; CHECK-NEXT: [[N_VEC6:%.*]] = sub i64 [[N]], [[N_MOD_VF5]] +; CHECK-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP29:%.*]] = mul nuw i64 [[TMP28]], 2 +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDEX7]] +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 2 x float>, ptr [[TMP30]], align 4 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDEX7]] +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 2 x float>, ptr [[TMP32]], align 4 +; CHECK-NEXT: [[TMP34:%.*]] = fmul <vscale x 2 x float> [[WIDE_LOAD8]], [[WIDE_LOAD9]] +; CHECK-NEXT: store <vscale x 2 x float> [[TMP34]], ptr [[TMP32]], align 4 +; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX7]], [[TMP29]] +; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC6]] +; CHECK-NEXT: br i1 [[TMP35]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N11:%.*]] = icmp eq i64 [[N]], [[N_VEC6]] +; CHECK-NEXT: br i1 [[CMP_N11]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC6]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; +; CHECK-VF8-LABEL: @trip_count_vscale( +; CHECK-VF8-NEXT: entry: +; CHECK-VF8-NEXT: [[V:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-VF8-NEXT: [[N:%.*]] = mul nuw nsw i64 [[V]], 1033 +; CHECK-VF8-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF8-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8 +; CHECK-VF8-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF8: vector.ph: +; CHECK-VF8-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF8-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-VF8-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-VF8-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF8-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF8-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8 +; CHECK-VF8-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF8: vector.body: +; CHECK-VF8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF8-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-VF8-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF8-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4 +; CHECK-VF8-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i64 [[TMP9]] +; CHECK-VF8-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP6]], align 4 +; CHECK-VF8-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, ptr [[TMP10]], align 4 +; CHECK-VF8-NEXT: [[TMP11:%.*]] = 
getelementptr inbounds nuw float, ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-VF8-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF8-NEXT: [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 4 +; CHECK-VF8-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP14]] +; CHECK-VF8-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP11]], align 4 +; CHECK-VF8-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP15]], align 4 +; CHECK-VF8-NEXT: [[TMP16:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-VF8-NEXT: [[TMP17:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD3]] +; CHECK-VF8-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF8-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 4 +; CHECK-VF8-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP19]] +; CHECK-VF8-NEXT: store <vscale x 4 x float> [[TMP16]], ptr [[TMP11]], align 4 +; CHECK-VF8-NEXT: store <vscale x 4 x float> [[TMP17]], ptr [[TMP20]], align 4 +; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-VF8-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF8-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-VF8: middle.block: +; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF8: scalar.ph: +; CHECK-VF8-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF8-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF8: for.body: +; +entry: + %v = tail call i64 @llvm.vscale.i64() + %n = mul nuw nsw i64 %v, 1033 + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw float, ptr %a, i64 %iv + %l.a = load float, ptr %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds nuw float, ptr %b, i64 %iv + %l.b = load float, ptr %arrayidx3, align 4 + %mul4 = fmul float %l.a, %l.b + store float %mul4, ptr %arrayidx3, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %for.body + +exit: + ret void +} + +; Loop with vscale-based trip count vscale x 1024. +; TODO: No epilogue vectorizations should remain when choosing VF = vscale x 4. 
+define void @trip_count_vscale_no_epilogue_iterations(ptr noalias %a, ptr noalias %b) vscale_range(1, 16) #0 { +; CHECK-LABEL: @trip_count_vscale_no_epilogue_iterations( +; CHECK-NEXT: iter.check: +; CHECK-NEXT: [[V:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[N:%.*]] = mul nuw nsw i64 [[V]], 1024 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP3]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP5]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 8 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw float, ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul nuw i64 [[TMP10]], 4 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i64 [[TMP11]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP15]], 4 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[TMP16]] +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP13]], align 4 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x float>, ptr [[TMP17]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD3]] +; CHECK-NEXT: [[TMP19:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD4]] +; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 4 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[TMP21]] +; CHECK-NEXT: store <vscale x 4 x float> [[TMP18]], ptr [[TMP13]], align 4 +; CHECK-NEXT: store <vscale x 4 x float> [[TMP19]], ptr [[TMP22]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] +; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP25:%.*]] = mul nuw i64 [[TMP24]], 2 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 
[[N_VEC_REMAINING]], [[TMP25]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP27:%.*]] = mul nuw i64 [[TMP26]], 2 +; CHECK-NEXT: [[N_MOD_VF5:%.*]] = urem i64 [[N]], [[TMP27]] +; CHECK-NEXT: [[N_VEC6:%.*]] = sub i64 [[N]], [[N_MOD_VF5]] +; CHECK-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP29:%.*]] = mul nuw i64 [[TMP28]], 2 +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDEX7]] +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 2 x float>, ptr [[TMP30]], align 4 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDEX7]] +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 2 x float>, ptr [[TMP32]], align 4 +; CHECK-NEXT: [[TMP34:%.*]] = fmul <vscale x 2 x float> [[WIDE_LOAD8]], [[WIDE_LOAD9]] +; CHECK-NEXT: store <vscale x 2 x float> [[TMP34]], ptr [[TMP32]], align 4 +; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX7]], [[TMP29]] +; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC6]] +; CHECK-NEXT: br i1 [[TMP35]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N11:%.*]] = icmp eq i64 [[N]], [[N_VEC6]] +; CHECK-NEXT: br i1 [[CMP_N11]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC6]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; +; CHECK-VF8-LABEL: @trip_count_vscale_no_epilogue_iterations( +; CHECK-VF8-NEXT: entry: +; CHECK-VF8-NEXT: [[V:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-VF8-NEXT: [[N:%.*]] = mul nuw nsw i64 [[V]], 1024 +; CHECK-VF8-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF8-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8 +; CHECK-VF8-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF8: vector.ph: +; CHECK-VF8-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF8-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-VF8-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-VF8-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF8-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF8-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8 +; CHECK-VF8-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-VF8: vector.body: +; CHECK-VF8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-VF8-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-VF8-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF8-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4 +; CHECK-VF8-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i64 [[TMP9]] +; CHECK-VF8-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP6]], align 4 +; CHECK-VF8-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x 
float>, ptr [[TMP10]], align 4 +; CHECK-VF8-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw float, ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-VF8-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF8-NEXT: [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 4 +; CHECK-VF8-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP14]] +; CHECK-VF8-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP11]], align 4 +; CHECK-VF8-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, ptr [[TMP15]], align 4 +; CHECK-VF8-NEXT: [[TMP16:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-VF8-NEXT: [[TMP17:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD3]] +; CHECK-VF8-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF8-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP18]], 4 +; CHECK-VF8-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[TMP19]] +; CHECK-VF8-NEXT: store <vscale x 4 x float> [[TMP16]], ptr [[TMP11]], align 4 +; CHECK-VF8-NEXT: store <vscale x 4 x float> [[TMP17]], ptr [[TMP20]], align 4 +; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-VF8-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF8-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-VF8: middle.block: +; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-VF8: scalar.ph: +; CHECK-VF8-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF8-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-VF8: for.body: +; +entry: + %v = tail call i64 @llvm.vscale.i64() + %n = mul nuw nsw i64 %v, 1024 + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw float, ptr %a, i64 %iv + %l.a = load float, ptr %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds nuw float, ptr %b, i64 %iv + %l.b = load float, ptr %arrayidx3, align 4 + %mul4 = fmul float %l.a, %l.b + store float %mul4, ptr %arrayidx3, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %for.body + +exit: + ret void +} + attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll index d5b25bf..21266e5 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S | FileCheck %s -check-prefix=NO-ZVFBFMIN +; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue | FileCheck %s -check-prefix=NO-ZVFBFMIN-PREDICATED ; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfbfmin -S | FileCheck %s -check-prefix=ZVFBFMIN define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) { @@ -21,6 +22,24 @@ define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) { ; NO-ZVFBFMIN: [[EXIT]]: ; NO-ZVFBFMIN-NEXT: ret void ; +; NO-ZVFBFMIN-PREDICATED-LABEL: define void @fadd( +; NO-ZVFBFMIN-PREDICATED-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; NO-ZVFBFMIN-PREDICATED-NEXT: [[ENTRY:.*]]: +; 
NO-ZVFBFMIN-PREDICATED-NEXT: br label %[[LOOP:.*]] +; NO-ZVFBFMIN-PREDICATED: [[LOOP]]: +; NO-ZVFBFMIN-PREDICATED-NEXT: [[I:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ] +; NO-ZVFBFMIN-PREDICATED-NEXT: [[A_GEP:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I]] +; NO-ZVFBFMIN-PREDICATED-NEXT: [[B_GEP:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I]] +; NO-ZVFBFMIN-PREDICATED-NEXT: [[X:%.*]] = load bfloat, ptr [[A_GEP]], align 2 +; NO-ZVFBFMIN-PREDICATED-NEXT: [[Y:%.*]] = load bfloat, ptr [[B_GEP]], align 2 +; NO-ZVFBFMIN-PREDICATED-NEXT: [[Z:%.*]] = fadd bfloat [[X]], [[Y]] +; NO-ZVFBFMIN-PREDICATED-NEXT: store bfloat [[Z]], ptr [[A_GEP]], align 2 +; NO-ZVFBFMIN-PREDICATED-NEXT: [[I_NEXT]] = add i64 [[I]], 1 +; NO-ZVFBFMIN-PREDICATED-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]] +; NO-ZVFBFMIN-PREDICATED-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]] +; NO-ZVFBFMIN-PREDICATED: [[EXIT]]: +; NO-ZVFBFMIN-PREDICATED-NEXT: ret void +; ; ZVFBFMIN-LABEL: define void @fadd( ; ZVFBFMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { ; ZVFBFMIN-NEXT: [[ENTRY:.*]]: @@ -133,6 +152,54 @@ define void @vfwmaccbf16.vv(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 ; NO-ZVFBFMIN: [[EXIT]]: ; NO-ZVFBFMIN-NEXT: ret void ; +; NO-ZVFBFMIN-PREDICATED-LABEL: define void @vfwmaccbf16.vv( +; NO-ZVFBFMIN-PREDICATED-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; NO-ZVFBFMIN-PREDICATED-NEXT: [[ENTRY:.*]]: +; NO-ZVFBFMIN-PREDICATED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; NO-ZVFBFMIN-PREDICATED-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; NO-ZVFBFMIN-PREDICATED: [[VECTOR_PH]]: +; NO-ZVFBFMIN-PREDICATED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; NO-ZVFBFMIN-PREDICATED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-ZVFBFMIN-PREDICATED-NEXT: br label %[[VECTOR_BODY:.*]] +; NO-ZVFBFMIN-PREDICATED: [[VECTOR_BODY]]: +; NO-ZVFBFMIN-PREDICATED-NEXT: [[I:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; NO-ZVFBFMIN-PREDICATED-NEXT: [[A_GEP:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I]] +; NO-ZVFBFMIN-PREDICATED-NEXT: [[B_GEP:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I]] +; NO-ZVFBFMIN-PREDICATED-NEXT: [[C_GEP:%.*]] = getelementptr float, ptr [[C]], i64 [[I]] +; NO-ZVFBFMIN-PREDICATED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <4 x bfloat>, ptr [[A_GEP]], align 2 +; NO-ZVFBFMIN-PREDICATED-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = load <4 x bfloat>, ptr [[B_GEP]], align 2 +; NO-ZVFBFMIN-PREDICATED-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = load <4 x float>, ptr [[C_GEP]], align 4 +; NO-ZVFBFMIN-PREDICATED-NEXT: [[TMP4:%.*]] = fpext <4 x bfloat> [[WIDE_MASKED_LOAD]] to <4 x float> +; NO-ZVFBFMIN-PREDICATED-NEXT: [[TMP5:%.*]] = fpext <4 x bfloat> [[WIDE_MASKED_LOAD3]] to <4 x float> +; NO-ZVFBFMIN-PREDICATED-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[WIDE_MASKED_LOAD4]]) +; NO-ZVFBFMIN-PREDICATED-NEXT: store <4 x float> [[TMP6]], ptr [[C_GEP]], align 4 +; NO-ZVFBFMIN-PREDICATED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[I]], 4 +; NO-ZVFBFMIN-PREDICATED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-ZVFBFMIN-PREDICATED-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; NO-ZVFBFMIN-PREDICATED: [[MIDDLE_BLOCK]]: +; NO-ZVFBFMIN-PREDICATED-NEXT: [[CMP_N:%.*]] = icmp eq i64 
[[N]], [[N_VEC]] +; NO-ZVFBFMIN-PREDICATED-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; NO-ZVFBFMIN-PREDICATED: [[SCALAR_PH]]: +; NO-ZVFBFMIN-PREDICATED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; NO-ZVFBFMIN-PREDICATED-NEXT: br label %[[LOOP:.*]] +; NO-ZVFBFMIN-PREDICATED: [[LOOP]]: +; NO-ZVFBFMIN-PREDICATED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ] +; NO-ZVFBFMIN-PREDICATED-NEXT: [[A_GEP1:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I1]] +; NO-ZVFBFMIN-PREDICATED-NEXT: [[B_GEP1:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I1]] +; NO-ZVFBFMIN-PREDICATED-NEXT: [[C_GEP1:%.*]] = getelementptr float, ptr [[C]], i64 [[I1]] +; NO-ZVFBFMIN-PREDICATED-NEXT: [[X:%.*]] = load bfloat, ptr [[A_GEP1]], align 2 +; NO-ZVFBFMIN-PREDICATED-NEXT: [[Y:%.*]] = load bfloat, ptr [[B_GEP1]], align 2 +; NO-ZVFBFMIN-PREDICATED-NEXT: [[Z:%.*]] = load float, ptr [[C_GEP1]], align 4 +; NO-ZVFBFMIN-PREDICATED-NEXT: [[X_EXT:%.*]] = fpext bfloat [[X]] to float +; NO-ZVFBFMIN-PREDICATED-NEXT: [[Y_EXT:%.*]] = fpext bfloat [[Y]] to float +; NO-ZVFBFMIN-PREDICATED-NEXT: [[FMULADD:%.*]] = call float @llvm.fmuladd.f32(float [[X_EXT]], float [[Y_EXT]], float [[Z]]) +; NO-ZVFBFMIN-PREDICATED-NEXT: store float [[FMULADD]], ptr [[C_GEP1]], align 4 +; NO-ZVFBFMIN-PREDICATED-NEXT: [[I_NEXT]] = add i64 [[I1]], 1 +; NO-ZVFBFMIN-PREDICATED-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]] +; NO-ZVFBFMIN-PREDICATED-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; NO-ZVFBFMIN-PREDICATED: [[EXIT]]: +; NO-ZVFBFMIN-PREDICATED-NEXT: ret void +; ; ZVFBFMIN-LABEL: define void @vfwmaccbf16.vv( ; ZVFBFMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; ZVFBFMIN-NEXT: [[ENTRY:.*]]: @@ -213,6 +280,11 @@ exit: ; NO-ZVFBFMIN: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} ; NO-ZVFBFMIN: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} ;. +; NO-ZVFBFMIN-PREDICATED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; NO-ZVFBFMIN-PREDICATED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; NO-ZVFBFMIN-PREDICATED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; NO-ZVFBFMIN-PREDICATED: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +;. 
; ZVFBFMIN: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; ZVFBFMIN: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; ZVFBFMIN: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll index 5b56552..53e43e1 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S | FileCheck %s -check-prefix=NO-ZVFHMIN +; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue | FileCheck %s -check-prefix=NO-ZVFHMIN-PREDICATED ; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfhmin -S | FileCheck %s -check-prefix=ZVFHMIN define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) { @@ -21,6 +22,24 @@ define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) { ; NO-ZVFHMIN: [[EXIT]]: ; NO-ZVFHMIN-NEXT: ret void ; +; NO-ZVFHMIN-PREDICATED-LABEL: define void @fadd( +; NO-ZVFHMIN-PREDICATED-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; NO-ZVFHMIN-PREDICATED-NEXT: [[ENTRY:.*]]: +; NO-ZVFHMIN-PREDICATED-NEXT: br label %[[LOOP:.*]] +; NO-ZVFHMIN-PREDICATED: [[LOOP]]: +; NO-ZVFHMIN-PREDICATED-NEXT: [[I:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ] +; NO-ZVFHMIN-PREDICATED-NEXT: [[A_GEP:%.*]] = getelementptr half, ptr [[A]], i64 [[I]] +; NO-ZVFHMIN-PREDICATED-NEXT: [[B_GEP:%.*]] = getelementptr half, ptr [[B]], i64 [[I]] +; NO-ZVFHMIN-PREDICATED-NEXT: [[X:%.*]] = load half, ptr [[A_GEP]], align 2 +; NO-ZVFHMIN-PREDICATED-NEXT: [[Y:%.*]] = load half, ptr [[B_GEP]], align 2 +; NO-ZVFHMIN-PREDICATED-NEXT: [[Z:%.*]] = fadd half [[X]], [[Y]] +; NO-ZVFHMIN-PREDICATED-NEXT: store half [[Z]], ptr [[A_GEP]], align 2 +; NO-ZVFHMIN-PREDICATED-NEXT: [[I_NEXT]] = add i64 [[I]], 1 +; NO-ZVFHMIN-PREDICATED-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]] +; NO-ZVFHMIN-PREDICATED-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]] +; NO-ZVFHMIN-PREDICATED: [[EXIT]]: +; NO-ZVFHMIN-PREDICATED-NEXT: ret void +; ; ZVFHMIN-LABEL: define void @fadd( ; ZVFHMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { ; ZVFHMIN-NEXT: [[ENTRY:.*]]: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-inloop-reduction.ll index b7d7fc1..e0594ad 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-inloop-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-inloop-reduction.ll @@ -122,35 +122,34 @@ for.end: define i32 @mul(ptr %a, i64 %n, i32 %start) { ; IF-EVL-LABEL: @mul( ; IF-EVL-NEXT: entry: -; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] +; IF-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N_RND_UP:%.*]], 8 +; IF-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; IF-EVL: vector.ph: -; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 3 -; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8 ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 -; IF-EVL-NEXT: 
[[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[IV]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3> -; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]] +; IF-EVL-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] -; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[TMP1]], <4 x i32> poison) -; IF-EVL-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> splat (i32 1) +; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 4 +; IF-EVL-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 +; IF-EVL-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 ; IF-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP4]]) ; IF-EVL-NEXT: [[MUL]] = mul i32 [[TMP5]], [[RDX]] -; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 4 +; IF-EVL-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD2]]) +; IF-EVL-NEXT: [[TMP6]] = mul i32 [[TMP8]], [[VEC_PHI1]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 8 ; IF-EVL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br label [[FOR_END:%.*]] +; IF-EVL-NEXT: [[BIN_RDX:%.*]] = mul i32 [[TMP6]], [[MUL]] +; IF-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_RND_UP]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY1]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ] ; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] @@ -159,10 +158,10 @@ define i32 @mul(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; IF-EVL-NEXT: [[MUL1]] = mul nsw i32 [[TMP0]], [[RDX1]] ; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]] +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N_RND_UP]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL1]], [[FOR_BODY1]] ], [ [[MUL]], 
[[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL1]], [[FOR_BODY1]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[MUL_LCSSA]] ; ; NO-VP-LABEL: @mul( @@ -1114,35 +1113,34 @@ for.end: define float @fmul(ptr %a, i64 %n, float %start) { ; IF-EVL-LABEL: @fmul( ; IF-EVL-NEXT: entry: -; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] +; IF-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N_RND_UP:%.*]], 8 +; IF-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; IF-EVL: vector.ph: -; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 3 -; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8 ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[RDX:%.*]] = phi float [ [[START:%.*]], [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[IV]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3> -; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]] +; IF-EVL-NEXT: [[VEC_PHI1:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]] -; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP2]], i32 4, <4 x i1> [[TMP1]], <4 x float> poison) -; IF-EVL-NEXT: [[TMP4:%.*]] = select reassoc <4 x i1> [[TMP1]], <4 x float> [[WIDE_MASKED_LOAD]], <4 x float> splat (float 1.000000e+00) +; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 4 +; IF-EVL-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; IF-EVL-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 ; IF-EVL-NEXT: [[TMP5:%.*]] = call reassoc float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP4]]) ; IF-EVL-NEXT: [[MUL]] = fmul reassoc float [[TMP5]], [[RDX]] -; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 4 +; IF-EVL-NEXT: [[TMP8:%.*]] = call reassoc float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[WIDE_LOAD2]]) +; IF-EVL-NEXT: [[TMP6]] = fmul reassoc float [[TMP8]], [[VEC_PHI1]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 8 ; IF-EVL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br label [[FOR_END:%.*]] +; IF-EVL-NEXT: [[BIN_RDX:%.*]] = fmul reassoc float [[TMP6]], [[MUL]] +; IF-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_RND_UP]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ] -; IF-EVL-NEXT: 
[[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY1]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ] ; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] @@ -1151,10 +1149,10 @@ define float @fmul(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; IF-EVL-NEXT: [[MUL1]] = fmul reassoc float [[TMP0]], [[RDX1]] ; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]] +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N_RND_UP]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP24:![0-9]+]] ; IF-EVL: for.end: -; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL1]], [[FOR_BODY1]] ], [ [[MUL]], [[MIDDLE_BLOCK]] ] +; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL1]], [[FOR_BODY1]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret float [[MUL_LCSSA]] ; ; NO-VP-LABEL: @fmul( @@ -1449,37 +1447,35 @@ for.end: define float @fminimum(ptr %a, i64 %n, float %start) { ; IF-EVL-LABEL: @fminimum( ; IF-EVL-NEXT: entry: -; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] +; IF-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N_RND_UP:%.*]], 16 +; IF-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; IF-EVL: vector.ph: -; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 7 -; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8 +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT3]], <8 x i64> poison, <8 x i32> zeroinitializer ; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <8 x float> poison, float [[START:%.*]], i64 0 ; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT2]], <8 x float> poison, <8 x i32> zeroinitializer ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[BROADCAST_SPLAT]], [[ENTRY]] ], [ [[TMP4:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[IV]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> -; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT4]] +; IF-EVL-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ [[BROADCAST_SPLAT]], [[ENTRY]] ], [ [[TMP3:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]] -; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP2]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison) +; IF-EVL-NEXT: 
[[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 8 +; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4 +; IF-EVL-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP1]], align 4 ; IF-EVL-NEXT: [[TMP4]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[VEC_PHI]], <8 x float> [[WIDE_MASKED_LOAD]]) -; IF-EVL-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[TMP4]], <8 x float> [[VEC_PHI]] -; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 8 +; IF-EVL-NEXT: [[TMP3]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[VEC_PHI1]], <8 x float> [[WIDE_LOAD2]]) +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16 ; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] ; IF-EVL: middle.block: +; IF-EVL-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[TMP4]], <8 x float> [[TMP3]]) ; IF-EVL-NEXT: [[TMP7:%.*]] = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> [[TMP5]]) -; IF-EVL-NEXT: br label [[FOR_END:%.*]] +; IF-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_RND_UP]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY1]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ] ; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] @@ -1488,7 +1484,7 @@ define float @fminimum(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; IF-EVL-NEXT: [[MIN]] = tail call float @llvm.minimum.f32(float [[RDX]], float [[TMP0]]) ; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]] +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N_RND_UP]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP30:![0-9]+]] ; IF-EVL: for.end: ; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY1]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] @@ -1559,37 +1555,35 @@ for.end: define float @fmaximum(ptr %a, i64 %n, float %start) { ; IF-EVL-LABEL: @fmaximum( ; IF-EVL-NEXT: entry: -; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] +; IF-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N_RND_UP:%.*]], 16 +; IF-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; IF-EVL: vector.ph: -; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 7 -; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8 +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT3]], <8 x i64> poison, <8 x i32> zeroinitializer ; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <8 x float> poison, float [[START:%.*]], i64 0 ; 
IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT2]], <8 x float> poison, <8 x i32> zeroinitializer ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[BROADCAST_SPLAT]], [[ENTRY]] ], [ [[TMP4:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[IV]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> -; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT4]] +; IF-EVL-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ [[BROADCAST_SPLAT]], [[ENTRY]] ], [ [[TMP3:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]] -; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP2]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison) +; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 8 +; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4 +; IF-EVL-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP1]], align 4 ; IF-EVL-NEXT: [[TMP4]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[VEC_PHI]], <8 x float> [[WIDE_MASKED_LOAD]]) -; IF-EVL-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[TMP4]], <8 x float> [[VEC_PHI]] -; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 8 +; IF-EVL-NEXT: [[TMP3]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[VEC_PHI1]], <8 x float> [[WIDE_LOAD2]]) +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16 ; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] ; IF-EVL: middle.block: +; IF-EVL-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[TMP4]], <8 x float> [[TMP3]]) ; IF-EVL-NEXT: [[TMP7:%.*]] = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> [[TMP5]]) -; IF-EVL-NEXT: br label [[FOR_END:%.*]] +; IF-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_RND_UP]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY1]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ] ; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] @@ -1598,7 +1592,7 @@ define float @fmaximum(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; IF-EVL-NEXT: [[MAX]] = tail call float @llvm.maximum.f32(float [[RDX]], float [[TMP0]]) ; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]] +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N_RND_UP]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], 
!llvm.loop [[LOOP32:![0-9]+]] ; IF-EVL: for.end: ; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY1]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reduction.ll index d2dc482..3a963b0 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-reduction.ll @@ -123,36 +123,34 @@ for.end: define i32 @mul(ptr %a, i64 %n, i32 %start) { ; IF-EVL-LABEL: @mul( ; IF-EVL-NEXT: entry: -; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] +; IF-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N_RND_UP:%.*]], 16 +; IF-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; IF-EVL: vector.ph: -; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 7 -; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8 +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> splat (i32 1), i32 [[START:%.*]], i32 0 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ [[TMP9]], [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[IV]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> -; IF-EVL-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]] +; IF-EVL-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ splat (i32 1), [[ENTRY]] ], [ [[TMP4:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] -; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP2]], <8 x i32> poison) +; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 8 +; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <8 x i32>, ptr [[TMP3]], align 4 +; IF-EVL-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4 ; IF-EVL-NEXT: [[TMP5]] = mul <8 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP2]], <8 x i32> [[TMP5]], <8 x i32> [[VEC_PHI]] -; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 8 +; IF-EVL-NEXT: [[TMP4]] = mul <8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16 ; IF-EVL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; IF-EVL: middle.block: +; IF-EVL-NEXT: [[TMP6:%.*]] = mul <8 x i32> [[TMP4]], [[TMP5]] ; IF-EVL-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP6]]) -; IF-EVL-NEXT: br label [[FOR_END:%.*]] +; IF-EVL-NEXT: [[CMP_N:%.*]] = icmp 
eq i64 [[N_RND_UP]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY1]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ] ; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] @@ -161,7 +159,7 @@ define i32 @mul(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; IF-EVL-NEXT: [[MUL]] = mul nsw i32 [[TMP0]], [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]] +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N_RND_UP]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL: for.end: ; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY1]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] @@ -1156,36 +1154,34 @@ for.end: define float @fmul(ptr %a, i64 %n, float %start) { ; IF-EVL-LABEL: @fmul( ; IF-EVL-NEXT: entry: -; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] +; IF-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N_RND_UP:%.*]], 16 +; IF-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; IF-EVL: vector.ph: -; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 7 -; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8 +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-NEXT: [[TMP9:%.*]] = insertelement <8 x float> splat (float 1.000000e+00), float [[START:%.*]], i32 0 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[TMP9]], [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[IV]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> -; IF-EVL-NEXT: [[TMP2:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT2]] +; IF-EVL-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ splat (float 1.000000e+00), [[ENTRY]] ], [ [[TMP4:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]] -; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP3]], i32 4, <8 x i1> [[TMP2]], <8 x float> poison) +; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 8 +; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <8 x float>, ptr [[TMP3]], align 4 
+; IF-EVL-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP2]], align 4 ; IF-EVL-NEXT: [[TMP5]] = fmul reassoc <8 x float> [[WIDE_MASKED_LOAD]], [[VEC_PHI]] -; IF-EVL-NEXT: [[TMP6:%.*]] = select reassoc <8 x i1> [[TMP2]], <8 x float> [[TMP5]], <8 x float> [[VEC_PHI]] -; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 8 +; IF-EVL-NEXT: [[TMP4]] = fmul reassoc <8 x float> [[WIDE_LOAD2]], [[VEC_PHI1]] +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16 ; IF-EVL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; IF-EVL: middle.block: +; IF-EVL-NEXT: [[TMP6:%.*]] = fmul reassoc <8 x float> [[TMP4]], [[TMP5]] ; IF-EVL-NEXT: [[TMP8:%.*]] = call reassoc float @llvm.vector.reduce.fmul.v8f32(float 1.000000e+00, <8 x float> [[TMP6]]) -; IF-EVL-NEXT: br label [[FOR_END:%.*]] +; IF-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_RND_UP]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY1]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ] ; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] @@ -1194,7 +1190,7 @@ define float @fmul(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; IF-EVL-NEXT: [[MUL]] = fmul reassoc float [[TMP0]], [[RDX]] ; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]] +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N_RND_UP]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP24:![0-9]+]] ; IF-EVL: for.end: ; IF-EVL-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY1]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] @@ -1502,37 +1498,35 @@ for.end: define float @fminimum(ptr %a, i64 %n, float %start) { ; IF-EVL-LABEL: @fminimum( ; IF-EVL-NEXT: entry: -; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] +; IF-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N_RND_UP:%.*]], 16 +; IF-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; IF-EVL: vector.ph: -; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 7 -; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8 +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT3]], <8 x i64> poison, <8 x i32> zeroinitializer ; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <8 x float> poison, float [[START:%.*]], i64 0 ; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT2]], <8 x float> poison, <8 x i32> zeroinitializer ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 
[[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[BROADCAST_SPLAT]], [[ENTRY]] ], [ [[TMP4:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[IV]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> -; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT4]] +; IF-EVL-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ [[BROADCAST_SPLAT]], [[ENTRY]] ], [ [[TMP3:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]] -; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP2]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison) +; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 8 +; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4 +; IF-EVL-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP1]], align 4 ; IF-EVL-NEXT: [[TMP4]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[VEC_PHI]], <8 x float> [[WIDE_MASKED_LOAD]]) -; IF-EVL-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[TMP4]], <8 x float> [[VEC_PHI]] -; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 8 +; IF-EVL-NEXT: [[TMP3]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[VEC_PHI1]], <8 x float> [[WIDE_LOAD2]]) +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16 ; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] ; IF-EVL: middle.block: +; IF-EVL-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.minimum.v8f32(<8 x float> [[TMP4]], <8 x float> [[TMP3]]) ; IF-EVL-NEXT: [[TMP7:%.*]] = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> [[TMP5]]) -; IF-EVL-NEXT: br label [[FOR_END:%.*]] +; IF-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_RND_UP]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY1]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ] ; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] @@ -1541,7 +1535,7 @@ define float @fminimum(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; IF-EVL-NEXT: [[MIN]] = tail call float @llvm.minimum.f32(float [[RDX]], float [[TMP0]]) ; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]] +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N_RND_UP]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP30:![0-9]+]] ; IF-EVL: for.end: ; IF-EVL-NEXT: [[MIN_LCSSA:%.*]] = phi float [ [[MIN]], [[FOR_BODY1]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] @@ -1612,37 +1606,35 @@ for.end: define float @fmaximum(ptr %a, i64 %n, float %start) { ; IF-EVL-LABEL: 
@fmaximum( ; IF-EVL-NEXT: entry: -; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] +; IF-EVL-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N_RND_UP:%.*]], 16 +; IF-EVL-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; IF-EVL: vector.ph: -; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 7 -; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8 +; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT3]], <8 x i64> poison, <8 x i32> zeroinitializer ; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <8 x float> poison, float [[START:%.*]], i64 0 ; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT2]], <8 x float> poison, <8 x i32> zeroinitializer ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ [[BROADCAST_SPLAT]], [[ENTRY]] ], [ [[TMP4:%.*]], [[FOR_BODY]] ] -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[IV]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> -; IF-EVL-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT4]] +; IF-EVL-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ [[BROADCAST_SPLAT]], [[ENTRY]] ], [ [[TMP3:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[IV]] -; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0(ptr [[TMP2]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison) +; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 8 +; IF-EVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4 +; IF-EVL-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP1]], align 4 ; IF-EVL-NEXT: [[TMP4]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[VEC_PHI]], <8 x float> [[WIDE_MASKED_LOAD]]) -; IF-EVL-NEXT: [[TMP5:%.*]] = select <8 x i1> [[TMP1]], <8 x float> [[TMP4]], <8 x float> [[VEC_PHI]] -; IF-EVL-NEXT: [[IV_NEXT]] = add i64 [[IV]], 8 +; IF-EVL-NEXT: [[TMP3]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[VEC_PHI1]], <8 x float> [[WIDE_LOAD2]]) +; IF-EVL-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16 ; IF-EVL-NEXT: [[TMP6:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] ; IF-EVL: middle.block: +; IF-EVL-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.maximum.v8f32(<8 x float> [[TMP4]], <8 x float> [[TMP3]]) ; IF-EVL-NEXT: [[TMP7:%.*]] = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> [[TMP5]]) -; IF-EVL-NEXT: br label [[FOR_END:%.*]] +; IF-EVL-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_RND_UP]], [[N_VEC]] +; IF-EVL-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ] -; IF-EVL-NEXT: 
[[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY1]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ] ; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] @@ -1651,7 +1643,7 @@ define float @fmaximum(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; IF-EVL-NEXT: [[MAX]] = tail call float @llvm.maximum.f32(float [[RDX]], float [[TMP0]]) ; IF-EVL-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 -; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]] +; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N_RND_UP]] ; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP32:![0-9]+]] ; IF-EVL: for.end: ; IF-EVL-NEXT: [[MAX_LCSSA:%.*]] = phi float [ [[MAX]], [[FOR_BODY1]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll index 27d7bd0..4da31a0 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll @@ -25,13 +25,7 @@ define void @truncate_to_minimal_bitwidths_widen_cast_recipe(ptr %src) { ; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[AVL:%.*]] = sub i64 9, [[EVL_BASED_IV]] ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true) -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[EVL_BASED_IV]] -; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr align 1 [[TMP5]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]]) -; CHECK-NEXT: [[TMP8:%.*]] = zext <vscale x 8 x i8> [[VP_OP_LOAD]] to <vscale x 8 x i16> -; CHECK-NEXT: [[TMP12:%.*]] = mul <vscale x 8 x i16> zeroinitializer, [[TMP8]] -; CHECK-NEXT: [[TMP13:%.*]] = lshr <vscale x 8 x i16> [[TMP12]], splat (i16 1) -; CHECK-NEXT: [[TMP14:%.*]] = trunc <vscale x 8 x i16> [[TMP13]] to <vscale x 8 x i8> -; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP14]], <vscale x 8 x ptr> align 1 zeroinitializer, <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]]) +; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> zeroinitializer, <vscale x 8 x ptr> align 1 zeroinitializer, <vscale x 8 x i1> splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP7]] to i64 ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP9]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll b/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll index 27abddf..bb2e099 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll @@ -39,15 +39,13 @@ define void @type_info_cache_clobber(ptr %dstv, ptr %src, i64 %wide.trip.count) ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[EVL_BASED_IV]] ; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = 
call <vscale x 8 x i8> @llvm.vp.load.nxv8i8.p0(ptr align 1 [[TMP13]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP11]]), !alias.scope [[META0:![0-9]+]] ; CHECK-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[VP_OP_LOAD]] to <vscale x 8 x i32> -; CHECK-NEXT: [[VP_OP:%.*]] = mul <vscale x 8 x i32> [[TMP15]], zeroinitializer ; CHECK-NEXT: [[TMP23:%.*]] = ashr <vscale x 8 x i32> [[TMP15]], zeroinitializer ; CHECK-NEXT: [[VP_OP3:%.*]] = or <vscale x 8 x i32> [[TMP23]], zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = icmp ult <vscale x 8 x i32> [[TMP15]], zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = select <vscale x 8 x i1> [[TMP16]], <vscale x 8 x i32> [[VP_OP3]], <vscale x 8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP24:%.*]] = trunc <vscale x 8 x i32> [[TMP17]] to <vscale x 8 x i8> ; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i8.nxv8p0(<vscale x 8 x i8> [[TMP24]], <vscale x 8 x ptr> align 1 [[BROADCAST_SPLAT]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP11]]), !alias.scope [[META3:![0-9]+]], !noalias [[META0]] -; CHECK-NEXT: [[TMP19:%.*]] = trunc <vscale x 8 x i32> [[VP_OP]] to <vscale x 8 x i16> -; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> [[TMP19]], <vscale x 8 x ptr> align 2 zeroinitializer, <vscale x 8 x i1> splat (i1 true), i32 [[TMP11]]) +; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x ptr> align 2 zeroinitializer, <vscale x 8 x i1> splat (i1 true), i32 [[TMP11]]) ; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP11]] to i64 ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll index 1ad75bb..3d44317 100644 --- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll +++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll @@ -281,3 +281,147 @@ exit: %res = phi i64 [ %iv, %loop.header ], [ 1, %loop.latch ] ret i64 %res } + +define i8 @test_early_exit_max_vector_tc_eq_16(ptr dereferenceable(17) %A) nosync nofree { +; VF8UF1-LABEL: define i8 @test_early_exit_max_vector_tc_eq_16( +; VF8UF1-SAME: ptr dereferenceable(17) [[A:%.*]]) #[[ATTR0]] { +; VF8UF1-NEXT: [[ENTRY:.*]]: +; VF8UF1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF8UF1: [[VECTOR_PH]]: +; VF8UF1-NEXT: br label %[[VECTOR_BODY:.*]] +; VF8UF1: [[VECTOR_BODY]]: +; VF8UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF8UF1-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] +; VF8UF1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1 +; VF8UF1-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer +; VF8UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; VF8UF1-NEXT: [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP1]]) +; VF8UF1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; VF8UF1-NEXT: [[TMP4:%.*]] = or i1 [[TMP2]], [[TMP3]] +; VF8UF1-NEXT: br i1 [[TMP4]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VF8UF1: [[MIDDLE_SPLIT]]: +; VF8UF1-NEXT: br i1 [[TMP2]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] +; VF8UF1: [[MIDDLE_BLOCK]]: +; VF8UF1-NEXT: br label %[[SCALAR_PH]] +; VF8UF1: [[VECTOR_EARLY_EXIT]]: +; VF8UF1-NEXT: br label %[[EXIT:.*]] 
+; VF8UF1: [[SCALAR_PH]]: +; VF8UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VF8UF1-NEXT: br label %[[LOOP_HEADER:.*]] +; VF8UF1: [[LOOP_HEADER]]: +; VF8UF1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; VF8UF1-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; VF8UF1-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC]], align 1 +; VF8UF1-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 0 +; VF8UF1-NEXT: br i1 [[C]], label %[[EXIT]], label %[[LOOP_LATCH]] +; VF8UF1: [[LOOP_LATCH]]: +; VF8UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; VF8UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 17 +; VF8UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]] +; VF8UF1: [[EXIT]]: +; VF8UF1-NEXT: [[RES:%.*]] = phi i8 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ] +; VF8UF1-NEXT: ret i8 [[RES]] +; +; VF8UF2-LABEL: define i8 @test_early_exit_max_vector_tc_eq_16( +; VF8UF2-SAME: ptr dereferenceable(17) [[A:%.*]]) #[[ATTR0]] { +; VF8UF2-NEXT: [[ENTRY:.*]]: +; VF8UF2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF8UF2: [[VECTOR_PH]]: +; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF8UF2: [[VECTOR_BODY]]: +; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF8UF2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] +; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 8 +; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP0]], align 1 +; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 +; VF8UF2-NEXT: [[TMP2:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer +; VF8UF2-NEXT: [[TMP3:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD1]], zeroinitializer +; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; VF8UF2-NEXT: [[TMP4:%.*]] = or <8 x i1> [[TMP2]], [[TMP3]] +; VF8UF2-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP4]]) +; VF8UF2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; VF8UF2-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; VF8UF2-NEXT: br i1 [[TMP7]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF8UF2: [[MIDDLE_SPLIT]]: +; VF8UF2-NEXT: br i1 [[TMP5]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] +; VF8UF2: [[MIDDLE_BLOCK]]: +; VF8UF2-NEXT: br label %[[SCALAR_PH]] +; VF8UF2: [[VECTOR_EARLY_EXIT]]: +; VF8UF2-NEXT: br label %[[EXIT:.*]] +; VF8UF2: [[SCALAR_PH]]: +; VF8UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VF8UF2-NEXT: br label %[[LOOP_HEADER:.*]] +; VF8UF2: [[LOOP_HEADER]]: +; VF8UF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; VF8UF2-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; VF8UF2-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC]], align 1 +; VF8UF2-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 0 +; VF8UF2-NEXT: br i1 [[C]], label %[[EXIT]], label %[[LOOP_LATCH]] +; VF8UF2: [[LOOP_LATCH]]: +; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 17 +; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]] +; VF8UF2: [[EXIT]]: +; VF8UF2-NEXT: [[RES:%.*]] = phi i8 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ] +; VF8UF2-NEXT: ret i8 [[RES]] +; 
+; VF16UF1-LABEL: define i8 @test_early_exit_max_vector_tc_eq_16( +; VF16UF1-SAME: ptr dereferenceable(17) [[A:%.*]]) #[[ATTR0]] { +; VF16UF1-NEXT: [[ENTRY:.*]]: +; VF16UF1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF16UF1: [[VECTOR_PH]]: +; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]] +; VF16UF1: [[VECTOR_BODY]]: +; VF16UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF16UF1-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] +; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1 +; VF16UF1-NEXT: [[TMP1:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], zeroinitializer +; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; VF16UF1-NEXT: [[TMP2:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP1]]) +; VF16UF1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; VF16UF1-NEXT: [[TMP4:%.*]] = or i1 [[TMP2]], [[TMP3]] +; VF16UF1-NEXT: br i1 [[TMP4]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF16UF1: [[MIDDLE_SPLIT]]: +; VF16UF1-NEXT: br i1 [[TMP2]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] +; VF16UF1: [[MIDDLE_BLOCK]]: +; VF16UF1-NEXT: br label %[[SCALAR_PH]] +; VF16UF1: [[VECTOR_EARLY_EXIT]]: +; VF16UF1-NEXT: br label %[[EXIT:.*]] +; VF16UF1: [[SCALAR_PH]]: +; VF16UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VF16UF1-NEXT: br label %[[LOOP_HEADER:.*]] +; VF16UF1: [[LOOP_HEADER]]: +; VF16UF1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; VF16UF1-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] +; VF16UF1-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC]], align 1 +; VF16UF1-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 0 +; VF16UF1-NEXT: br i1 [[C]], label %[[EXIT]], label %[[LOOP_LATCH]] +; VF16UF1: [[LOOP_LATCH]]: +; VF16UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; VF16UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 17 +; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]] +; VF16UF1: [[EXIT]]: +; VF16UF1-NEXT: [[RES:%.*]] = phi i8 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ] +; VF16UF1-NEXT: ret i8 [[RES]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %p.src = getelementptr inbounds i8, ptr %A, i64 %iv + %l = load i8, ptr %p.src, align 1 + %c = icmp eq i8 %l, 0 + br i1 %c, label %exit, label %loop.latch + +loop.latch: + %iv.next = add nsw i64 %iv, 1 + %cmp = icmp eq i64 %iv.next, 17 + br i1 %cmp, label %exit, label %loop.header + +exit: + %res = phi i8 [ 0, %loop.header ], [ 1, %loop.latch ] + ret i8 %res +} + + diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll index b396e29..59c76ae 100644 --- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll +++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll @@ -1218,6 +1218,133 @@ exit: ret void } +define void @test_vector_tc_eq_16(ptr %A) { +; VF8UF1-LABEL: define void @test_vector_tc_eq_16( +; VF8UF1-SAME: ptr [[A:%.*]]) { +; VF8UF1-NEXT: [[ENTRY:.*]]: +; VF8UF1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF8UF1: [[VECTOR_PH]]: +; VF8UF1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 16 +; VF8UF1-NEXT: br label 
%[[VECTOR_BODY:.*]] +; VF8UF1: [[VECTOR_BODY]]: +; VF8UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF8UF1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; VF8UF1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[NEXT_GEP]], align 1 +; VF8UF1-NEXT: [[TMP1:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10) +; VF8UF1-NEXT: store <8 x i8> [[TMP1]], ptr [[NEXT_GEP]], align 1 +; VF8UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; VF8UF1-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; VF8UF1-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; VF8UF1: [[MIDDLE_BLOCK]]: +; VF8UF1-NEXT: br label %[[SCALAR_PH]] +; VF8UF1: [[SCALAR_PH]]: +; VF8UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VF8UF1-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ] +; VF8UF1-NEXT: br label %[[LOOP:.*]] +; VF8UF1: [[LOOP]]: +; VF8UF1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF8UF1-NEXT: [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ] +; VF8UF1-NEXT: [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1 +; VF8UF1-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC]], align 1 +; VF8UF1-NEXT: [[ADD:%.*]] = add nsw i8 [[L]], 10 +; VF8UF1-NEXT: store i8 [[ADD]], ptr [[P_SRC]], align 1 +; VF8UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; VF8UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 17 +; VF8UF1-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; VF8UF1: [[EXIT]]: +; VF8UF1-NEXT: ret void +; +; VF8UF2-LABEL: define void @test_vector_tc_eq_16( +; VF8UF2-SAME: ptr [[A:%.*]]) { +; VF8UF2-NEXT: [[ENTRY:.*]]: +; VF8UF2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF8UF2: [[VECTOR_PH]]: +; VF8UF2-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 16 +; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF8UF2: [[VECTOR_BODY]]: +; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF8UF2-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; VF8UF2-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 8 +; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[NEXT_GEP]], align 1 +; VF8UF2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 +; VF8UF2-NEXT: [[TMP2:%.*]] = add nsw <8 x i8> [[WIDE_LOAD]], splat (i8 10) +; VF8UF2-NEXT: [[TMP3:%.*]] = add nsw <8 x i8> [[WIDE_LOAD1]], splat (i8 10) +; VF8UF2-NEXT: store <8 x i8> [[TMP2]], ptr [[NEXT_GEP]], align 1 +; VF8UF2-NEXT: store <8 x i8> [[TMP3]], ptr [[TMP1]], align 1 +; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; VF8UF2-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; VF8UF2-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VF8UF2: [[MIDDLE_BLOCK]]: +; VF8UF2-NEXT: br label %[[SCALAR_PH]] +; VF8UF2: [[SCALAR_PH]]: +; VF8UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VF8UF2-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ] +; VF8UF2-NEXT: br label %[[LOOP:.*]] +; VF8UF2: [[LOOP]]: +; VF8UF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF8UF2-NEXT: [[P_SRC:%.*]] = phi ptr [ 
[[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ] +; VF8UF2-NEXT: [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1 +; VF8UF2-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC]], align 1 +; VF8UF2-NEXT: [[ADD:%.*]] = add nsw i8 [[L]], 10 +; VF8UF2-NEXT: store i8 [[ADD]], ptr [[P_SRC]], align 1 +; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 17 +; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; VF8UF2: [[EXIT]]: +; VF8UF2-NEXT: ret void +; +; VF16UF1-LABEL: define void @test_vector_tc_eq_16( +; VF16UF1-SAME: ptr [[A:%.*]]) { +; VF16UF1-NEXT: [[ENTRY:.*]]: +; VF16UF1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF16UF1: [[VECTOR_PH]]: +; VF16UF1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 16 +; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]] +; VF16UF1: [[VECTOR_BODY]]: +; VF16UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF16UF1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 +; VF16UF1-NEXT: [[TMP1:%.*]] = add nsw <16 x i8> [[WIDE_LOAD]], splat (i8 10) +; VF16UF1-NEXT: store <16 x i8> [[TMP1]], ptr [[NEXT_GEP]], align 1 +; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; VF16UF1-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; VF16UF1-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VF16UF1: [[MIDDLE_BLOCK]]: +; VF16UF1-NEXT: br label %[[SCALAR_PH]] +; VF16UF1: [[SCALAR_PH]]: +; VF16UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VF16UF1-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ [[A]], %[[ENTRY]] ] +; VF16UF1-NEXT: br label %[[LOOP:.*]] +; VF16UF1: [[LOOP]]: +; VF16UF1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF16UF1-NEXT: [[P_SRC:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[P_SRC_NEXT:%.*]], %[[LOOP]] ] +; VF16UF1-NEXT: [[P_SRC_NEXT]] = getelementptr inbounds i8, ptr [[P_SRC]], i64 1 +; VF16UF1-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC]], align 1 +; VF16UF1-NEXT: [[ADD:%.*]] = add nsw i8 [[L]], 10 +; VF16UF1-NEXT: store i8 [[ADD]], ptr [[P_SRC]], align 1 +; VF16UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 +; VF16UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 17 +; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; VF16UF1: [[EXIT]]: +; VF16UF1-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %p.src = phi ptr [ %A, %entry ], [ %p.src.next, %loop ] + %p.src.next = getelementptr inbounds i8, ptr %p.src, i64 1 + %l = load i8, ptr %p.src, align 1 + %add = add nsw i8 %l, 10 + store i8 %add, ptr %p.src + %iv.next = add nsw i64 %iv, 1 + %cmp = icmp eq i64 %iv.next, 17 + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} ;. 
; VF8UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; VF8UF1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} @@ -1227,6 +1354,8 @@ exit: ; VF8UF1: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} ; VF8UF1: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]} ; VF8UF1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; VF8UF1: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; VF8UF1: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} ;. ; VF8UF2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; VF8UF2: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"} @@ -1234,6 +1363,8 @@ exit: ; VF8UF2: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} ; VF8UF2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; VF8UF2: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} +; VF8UF2: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]} +; VF8UF2: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]} ;. ; VF16UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; VF16UF1: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"} @@ -1241,4 +1372,6 @@ exit: ; VF16UF1: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} ; VF16UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; VF16UF1: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} +; VF16UF1: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]} +; VF16UF1: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]} ;. diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/func_assign_fix.ll b/llvm/test/Transforms/MemProfContextDisambiguation/func_assign_fix.ll new file mode 100644 index 0000000..d0450e0 --- /dev/null +++ b/llvm/test/Transforms/MemProfContextDisambiguation/func_assign_fix.ll @@ -0,0 +1,130 @@ +;; Make sure we assign the original callsite to a function clone (which will be +;; the original function clone), even when we cannot update its caller (due to +;; missing metadata e.g. from mismatched profiles). Otherwise we will try to use +;; the original function for a different clone, leading to confusion later when +;; rewriting the calls. 
+ +;; -stats requires asserts +; REQUIRES: asserts + +; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -stats -debug \ +; RUN: -pass-remarks=memprof-context-disambiguation %s -S 2>&1 | \ +; RUN: FileCheck %s --implicit-check-not="Mismatch in call clone assignment" \ +; RUN: --implicit-check-not="Number of callsites assigned to call multiple non-matching clones" + + +; ModuleID = '<stdin>' +source_filename = "reduced.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; CHECK-LABEL: define void @A() +define void @A() { + ; CHECK: call void @C() + call void @C() + ret void +} + +; CHECK-LABEL: define void @B() +define void @B() { + ; CHECK: call void @C.memprof.1() + call void @C(), !callsite !1 + ret void +} + +; CHECK-LABEL: define void @C() +define void @C() { + ; CHECK: call void @F() + call void @F(), !callsite !16 + ; CHECK: call void @D() + call void @D(), !callsite !2 + ret void +} + +; CHECK-LABEL: define void @D() +define void @D() { + ; CHECK: call void @E() + call void @E(), !callsite !3 + ; CHECK: call void @G() + call void @G(), !callsite !17 + ret void +} + +; CHECK-LABEL: define void @E() +define void @E() { + ; CHECK: call ptr @_Znwm(i64 0) #[[NOTCOLD:[0-9]+]] + %1 = call ptr @_Znwm(i64 0), !memprof !4, !callsite !9 + ret void +} + +; CHECK-LABEL: define void @F() +define void @F() { + ; CHECK: call void @G() + call void @G(), !callsite !17 + ret void +} + +; CHECK-LABEL: define void @G() +define void @G() { + ; CHECK: call ptr @_Znwm(i64 0) #[[NOTCOLD]] + %2 = call ptr @_Znwm(i64 0), !memprof !10, !callsite !15 + ret void +} + +; CHECK-LABEL: define void @A1() +define void @A1() { + ; CHECK: call void @C() + call void @C(), !callsite !18 + ret void +} + +; CHECK-LABEL: define void @B1() +define void @B1() { + ; CHECK: call void @C.memprof.1() + call void @C(), !callsite !19 + ret void +} + +; CHECK-LABEL: define void @C.memprof.1() + ; CHECK: call void @F.memprof.1() + ; CHECK: call void @D.memprof.1() + +; CHECK-LABEL: define void @D.memprof.1() + ; CHECK: call void @E.memprof.1() + ; CHECK: call void @G() + +; CHECK-LABEL: define void @E.memprof.1() + ; CHECK: call ptr @_Znwm(i64 0) #[[COLD:[0-9]+]] + +; CHECK-LABEL: define void @F.memprof.1() + ; CHECK: call void @G.memprof.1() + +; CHECK-LABEL: define void @G.memprof.1() + ; CHECK: call ptr @_Znwm(i64 0) #[[COLD]] + +declare ptr @_Znwm(i64) + +; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" } +; IR: attributes #[[COLD]] = { "memprof"="cold" } + +!0 = !{i64 123} +!1 = !{i64 234} +!2 = !{i64 345} +!3 = !{i64 456} +!4 = !{!5, !7} +!5 = !{!6, !"notcold"} +!6 = !{i64 567, i64 456, i64 345, i64 123} +!7 = !{!8, !"cold"} +!8 = !{i64 567, i64 456, i64 345, i64 234} +!9 = !{i64 567} +!10 = !{!11, !13} +!11 = !{!12, !"notcold"} +!12 = !{i64 678, i64 891, i64 789, i64 912} +!13 = !{!14, !"cold"} +!14 = !{i64 678, i64 891, i64 789, i64 812} +!15 = !{i64 678} +!16 = !{i64 789} +!17 = !{i64 891} +!18 = !{i64 912} +!19 = !{i64 812} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected index 51cafac..e1da112 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected @@ -10,15 +10,15 @@ 
define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct ; CHECK-NEXT: .reg .b64 %rd<13>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [caller_St8x4_param_0+8]; -; CHECK-NEXT: ld.param.b64 %rd2, [caller_St8x4_param_0]; -; CHECK-NEXT: ld.param.b64 %rd3, [caller_St8x4_param_0+24]; -; CHECK-NEXT: ld.param.b64 %rd4, [caller_St8x4_param_0+16]; ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[32]; -; CHECK-NEXT: st.param.v2.b64 [param0], {%rd2, %rd1}; -; CHECK-NEXT: st.param.v2.b64 [param0+16], {%rd4, %rd3}; ; CHECK-NEXT: .param .align 16 .b8 retval0[32]; +; CHECK-NEXT: ld.param.b64 %rd1, [caller_St8x4_param_0+24]; +; CHECK-NEXT: ld.param.b64 %rd2, [caller_St8x4_param_0+16]; +; CHECK-NEXT: st.param.v2.b64 [param0+16], {%rd2, %rd1}; +; CHECK-NEXT: ld.param.b64 %rd3, [caller_St8x4_param_0+8]; +; CHECK-NEXT: ld.param.b64 %rd4, [caller_St8x4_param_0]; +; CHECK-NEXT: st.param.v2.b64 [param0], {%rd4, %rd3}; ; CHECK-NEXT: call.uni (retval0), callee_St8x4, (param0); ; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [retval0]; ; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [retval0+16]; diff --git a/llvm/tools/spirv-tools/CMakeLists.txt b/llvm/tools/spirv-tools/CMakeLists.txt index c2c0f3e..5db7aec 100644 --- a/llvm/tools/spirv-tools/CMakeLists.txt +++ b/llvm/tools/spirv-tools/CMakeLists.txt @@ -5,10 +5,6 @@ if (NOT LLVM_INCLUDE_SPIRV_TOOLS_TESTS) return() endif () -if (NOT "SPIRV" IN_LIST LLVM_TARGETS_TO_BUILD) - message(FATAL_ERROR "Building SPIRV-Tools tests is unsupported without the SPIR-V target") -endif () - # SPIRV_DIS, SPIRV_VAL, SPIRV_AS and SPIRV_LINK variables can be used to provide paths to existing # spirv-dis, spirv-val, spirv-as, and spirv-link binaries, respectively. Otherwise, build them from # SPIRV-Tools source. diff --git a/llvm/unittests/Analysis/IR2VecTest.cpp b/llvm/unittests/Analysis/IR2VecTest.cpp index 7c9a546..e288585 100644 --- a/llvm/unittests/Analysis/IR2VecTest.cpp +++ b/llvm/unittests/Analysis/IR2VecTest.cpp @@ -364,9 +364,9 @@ TEST_F(IR2VecTestFixture, GetFunctionVector) { EXPECT_TRUE(FuncVec.approximatelyEquals(Embedding(2, 44.4))); } -static constexpr unsigned MaxOpcodes = 67; -static constexpr unsigned MaxTypeIDs = 21; -static constexpr unsigned MaxOperands = 4; +static constexpr unsigned MaxOpcodes = Vocabulary::MaxOpcodes; +static constexpr unsigned MaxTypeIDs = Vocabulary::MaxTypeIDs; +static constexpr unsigned MaxOperands = Vocabulary::MaxOperandKinds; TEST(IR2VecVocabularyTest, DummyVocabTest) { for (unsigned Dim = 1; Dim <= 10; ++Dim) { diff --git a/llvm/unittests/Support/DebugLogTest.cpp b/llvm/unittests/Support/DebugLogTest.cpp index c34d888..0c464c1 100644 --- a/llvm/unittests/Support/DebugLogTest.cpp +++ b/llvm/unittests/Support/DebugLogTest.cpp @@ -6,7 +6,13 @@ // //===----------------------------------------------------------------------===// +// This macro is defined in the LLVM build system, but we undefine it here +// so that we test at least once in-tree the case where __SHORT_FILE__ is not +// defined. 
+#undef __SHORT_FILE__ + #include "llvm/Support/DebugLog.h" +#include "llvm/ADT/Sequence.h" #include "llvm/Support/raw_ostream.h" #include "gmock/gmock.h" #include "gtest/gtest.h" @@ -26,7 +32,7 @@ TEST(DebugLogTest, Basic) { { std::string str; raw_string_ostream os(str); - DEBUGLOG_WITH_STREAM_AND_TYPE(os, nullptr) << "NoType"; + DEBUGLOG_WITH_STREAM_AND_TYPE(os, 0, nullptr) << "NoType"; EXPECT_FALSE(StringRef(os.str()).starts_with('[')); EXPECT_TRUE(StringRef(os.str()).ends_with("NoType\n")); } @@ -35,8 +41,8 @@ TEST(DebugLogTest, Basic) { { std::string str; raw_string_ostream os(str); - DEBUGLOG_WITH_STREAM_AND_TYPE(os, "A") << "A"; - DEBUGLOG_WITH_STREAM_AND_TYPE(os, "B") << "B"; + DEBUGLOG_WITH_STREAM_AND_TYPE(os, 0, "A") << "A"; + DEBUGLOG_WITH_STREAM_AND_TYPE(os, 0, "B") << "B"; EXPECT_TRUE(StringRef(os.str()).starts_with('[')); EXPECT_THAT(os.str(), AllOf(HasSubstr("A\n"), HasSubstr("B\n"))); } @@ -47,22 +53,55 @@ TEST(DebugLogTest, Basic) { raw_string_ostream os(str); // Just check that the macro doesn't result in dangling else. if (true) - DEBUGLOG_WITH_STREAM_AND_TYPE(os, "A") << "A"; + DEBUGLOG_WITH_STREAM_AND_TYPE(os, 0, "A") << "A"; else - DEBUGLOG_WITH_STREAM_AND_TYPE(os, "A") << "B"; - DEBUGLOG_WITH_STREAM_AND_TYPE(os, "B") << "B"; + DEBUGLOG_WITH_STREAM_AND_TYPE(os, 0, "A") << "B"; + DEBUGLOG_WITH_STREAM_AND_TYPE(os, 0, "B") << "B"; EXPECT_THAT(os.str(), AllOf(HasSubstr("A\n"), Not(HasSubstr("B\n")))); int count = 0; auto inc = [&]() { return ++count; }; EXPECT_THAT(count, Eq(0)); - DEBUGLOG_WITH_STREAM_AND_TYPE(os, "A") << inc(); + DEBUGLOG_WITH_STREAM_AND_TYPE(os, 0, "A") << inc(); EXPECT_THAT(count, Eq(1)); - DEBUGLOG_WITH_STREAM_AND_TYPE(os, "B") << inc(); + DEBUGLOG_WITH_STREAM_AND_TYPE(os, 0, "B") << inc(); EXPECT_THAT(count, Eq(1)); } } +TEST(DebugLogTest, BasicWithLevel) { + llvm::DebugFlag = true; + // We expect A to be always printed, B to be printed only when level is 1 or + // below, and C to be printed only when level is 0 or below. + static const char *DT[] = {"A", "B:1", "C:"}; + + setCurrentDebugTypes(DT, sizeof(DT) / sizeof(DT[0])); + std::string str; + raw_string_ostream os(str); + for (auto type : {"A", "B", "C", "D"}) + for (int level : llvm::seq<int>(0, 4)) + DEBUGLOG_WITH_STREAM_TYPE_FILE_AND_LINE(os, level, type, type, level) + << level; + EXPECT_EQ(os.str(), "[A:0] A:0 0\n[A:1] A:1 1\n[A:2] A:2 2\n[A:3] A:3 " + "3\n[B:0] B:0 0\n[B:1] B:1 1\n[C:0] C:0 0\n"); +} + +TEST(DebugLogTest, NegativeLevel) { + llvm::DebugFlag = true; + // Test the special behavior when all the levels are 0. + // In this case we expect all the debug types to be printed. 
+ static const char *DT[] = {"A:"}; + + setCurrentDebugTypes(DT, sizeof(DT) / sizeof(DT[0])); + std::string str; + raw_string_ostream os(str); + for (auto type : {"A", "B"}) + for (int level : llvm::seq<int>(0, 2)) + DEBUGLOG_WITH_STREAM_TYPE_FILE_AND_LINE(os, level, type, type, level) + << level; + EXPECT_EQ(os.str(), "[A:0] A:0 0\n[B:0] B:0 0\n[B:1] B:1 1\n"); +} + TEST(DebugLogTest, StreamPrefix) { llvm::DebugFlag = true; static const char *DT[] = {"A", "B"}; diff --git a/llvm/unittests/TargetParser/TripleTest.cpp b/llvm/unittests/TargetParser/TripleTest.cpp index 36408de..35927e3 100644 --- a/llvm/unittests/TargetParser/TripleTest.cpp +++ b/llvm/unittests/TargetParser/TripleTest.cpp @@ -758,6 +758,12 @@ TEST(TripleTest, ParsedIDs) { EXPECT_EQ(Triple::UnknownOS, T.getOS()); EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment()); + T = Triple("riscv64-meta-unknown-mtia"); + EXPECT_EQ(Triple::riscv64, T.getArch()); + EXPECT_EQ(Triple::Meta, T.getVendor()); + EXPECT_EQ(Triple::UnknownOS, T.getOS()); + EXPECT_EQ(Triple::MTIA, T.getEnvironment()); + T = Triple("riscv64-unknown-linux"); EXPECT_EQ(Triple::riscv64, T.getArch()); EXPECT_EQ(Triple::UnknownVendor, T.getVendor()); diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clangd/refactor/tweaks/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clangd/refactor/tweaks/BUILD.gn index 8d19295..defa12c 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clangd/refactor/tweaks/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clangd/refactor/tweaks/BUILD.gn @@ -30,6 +30,7 @@ source_set("tweaks") { "MemberwiseConstructor.cpp", "ObjCLocalizeStringLiteral.cpp", "ObjCMemberwiseInitializer.cpp", + "OverridePureVirtuals.cpp", "PopulateSwitch.cpp", "RawStringLiteral.cpp", "RemoveUsingNamespace.cpp", diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn index 7deefe9..ad32aa9 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clangd/unittests/BUILD.gn @@ -144,6 +144,7 @@ unittest("ClangdTests") { "tweaks/MemberwiseConstructorTests.cpp", "tweaks/ObjCLocalizeStringLiteralTests.cpp", "tweaks/ObjCMemberwiseInitializerTests.cpp", + "tweaks/OverridePureVirtualsTests.cpp", "tweaks/PopulateSwitchTests.cpp", "tweaks/RawStringLiteralTests.cpp", "tweaks/RemoveUsingNamespaceTests.cpp", diff --git a/llvm/utils/lldbDataFormatters.py b/llvm/utils/lldbDataFormatters.py index c5cd627..7fbeabe6 100644 --- a/llvm/utils/lldbDataFormatters.py +++ b/llvm/utils/lldbDataFormatters.py @@ -94,6 +94,11 @@ def __lldb_init_module(debugger, internal_dict): f"-l {__name__}.ExpectedSynthetic " '-x "^llvm::Expected<.+>$"' ) + debugger.HandleCommand( + "type summary add -w llvm " + f"-F {__name__}.SmallBitVectorSummary " + "llvm::SmallBitVector" + ) # Pretty printer for llvm::SmallVector/llvm::SmallVectorImpl @@ -448,3 +453,28 @@ class ExpectedSynthetic: if idx == 0: return self.stored_value return lldb.SBValue() + + +def SmallBitVectorSummary(valobj, _): + underlyingValue = valobj.GetChildMemberWithName("X").unsigned + numBaseBits = valobj.target.addr_size * 8 + smallNumRawBits = numBaseBits - 1 + smallNumSizeBits = None + if numBaseBits == 32: + smallNumSizeBits = 5 + elif numBaseBits == 64: + smallNumSizeBits = 6 + else: + smallNumSizeBits = smallNumRawBits + smallNumDataBits = smallNumRawBits - smallNumSizeBits + + # If our underlying value is not small, print we can not dump large values. 
+ isSmallMask = 1 + if underlyingValue & isSmallMask == 0: + return "<can not read large SmallBitVector>" + + smallRawBits = underlyingValue >> 1 + smallSize = smallRawBits >> smallNumDataBits + bits = smallRawBits & ((1 << (smallSize + 1)) - 1) + # format `bits` in binary (b), zero-padded to width `smallSize`, right aligned (>) + return f"[{bits:0>{smallSize}b}]"
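
The `SmallBitVectorSummary` provider added above decodes `llvm::SmallBitVector`'s small-mode encoding: the low bit of the raw word flags inline ("small") storage, and the remaining word bits carry the element count in the top 5 bits (32-bit targets) or 6 bits (64-bit targets), with the payload bits below. A minimal standalone sketch of that decode, runnable outside lldb, is shown next; the 64-bit layout and the hand-packed sample word are illustrative assumptions, and the masking deliberately mirrors the provider above.

    # Sketch of the small-mode decode performed by SmallBitVectorSummary,
    # assuming a 64-bit target (6 size bits above 57 data bits).
    def decode_small_bitvector(word, addr_size_bits=64):
        num_raw_bits = addr_size_bits - 1              # low bit is the small/large flag
        num_size_bits = 6 if addr_size_bits == 64 else 5
        num_data_bits = num_raw_bits - num_size_bits   # payload sits below the size field
        if word & 1 == 0:                              # flag clear: bits live on the heap
            return "<can not read large SmallBitVector>"
        raw = word >> 1                                # strip the discriminator bit
        size = raw >> num_data_bits                    # element count from the top bits
        bits = raw & ((1 << (size + 1)) - 1)           # same mask as the provider above
        return f"[{bits:0>{size}b}]"                   # zero-padded binary, width `size`

    word = ((5 << 57) | 0b10110) << 1 | 1              # hand-packed: size 5, payload 0b10110
    print(decode_small_bitvector(word))                # prints [10110]

To exercise the real provider, the formatters are typically loaded into a session with `command script import /path/to/llvm/utils/lldbDataFormatters.py` (the path depends on your checkout); the `type summary add` call registered in `__lldb_init_module` above then applies the summary to every `llvm::SmallBitVector` in the `llvm` type category.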