aboutsummaryrefslogtreecommitdiff
path: root/llvm
diff options
context:
space:
mode:
Diffstat (limited to 'llvm')
-rw-r--r--llvm/docs/MLGO.rst2
-rw-r--r--llvm/include/llvm/ADT/Bitset.h21
-rw-r--r--llvm/include/llvm/ADT/ImmutableSet.h2
-rw-r--r--llvm/include/llvm/Analysis/DependenceAnalysis.h8
-rw-r--r--llvm/include/llvm/Analysis/IR2Vec.h61
-rw-r--r--llvm/include/llvm/AsmParser/LLToken.h43
-rw-r--r--llvm/include/llvm/CodeGen/SlotIndexes.h15
-rw-r--r--llvm/include/llvm/IR/ConstantFPRange.h6
-rw-r--r--llvm/include/llvm/IR/PatternMatch.h12
-rw-r--r--llvm/include/llvm/IR/RuntimeLibcalls.h26
-rw-r--r--llvm/include/llvm/Target/GlobalISel/Combine.td2
-rw-r--r--llvm/include/llvm/Target/TargetSelectionDAG.td2
-rw-r--r--llvm/include/llvm/Transforms/Scalar/GVN.h12
-rw-r--r--llvm/include/llvm/Transforms/Utils/Local.h11
-rw-r--r--llvm/include/llvm/Transforms/Utils/MisExpect.h22
-rw-r--r--llvm/lib/Analysis/DependenceAnalysis.cpp32
-rw-r--r--llvm/lib/Analysis/IR2Vec.cpp180
-rw-r--r--llvm/lib/Analysis/InstructionSimplify.cpp10
-rw-r--r--llvm/lib/Analysis/TargetLibraryInfo.cpp5
-rw-r--r--llvm/lib/AsmParser/LLLexer.cpp1
-rw-r--r--llvm/lib/AsmParser/LLParser.cpp49
-rw-r--r--llvm/lib/BinaryFormat/Dwarf.cpp2
-rw-r--r--llvm/lib/Bitcode/Reader/MetadataLoader.cpp16
-rw-r--r--llvm/lib/Bitcode/Writer/BitcodeWriter.cpp8
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp10
-rw-r--r--llvm/lib/CodeGen/CodeGenPrepare.cpp6
-rw-r--r--llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp1
-rw-r--r--llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp16
-rw-r--r--llvm/lib/CodeGen/RegisterCoalescer.cpp6
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFDie.cpp3
-rw-r--r--llvm/lib/IR/AsmWriter.cpp14
-rw-r--r--llvm/lib/IR/ConstantFPRange.cpp20
-rw-r--r--llvm/lib/Target/AArch64/AArch64Combine.td2
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp81
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp91
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp34
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp16
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp28
-rw-r--r--llvm/lib/Target/AMDGPU/GCNRegPressure.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/GCNRegPressure.h4
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp32
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h6
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3PInstructions.td8
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.cpp1
-rw-r--r--llvm/lib/Target/Hexagon/HexagonPatterns.td21
-rw-r--r--llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp2
-rw-r--r--llvm/lib/Target/Mips/MipsInstrInfo.td16
-rw-r--r--llvm/lib/Target/RISCV/RISCVFeatures.td7
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td23
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrPredicates.td4
-rw-r--r--llvm/lib/Target/RISCV/RISCVProcessors.td3
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedSiFive7.td142
-rw-r--r--llvm/lib/Target/RISCV/RISCVScheduleV.td16
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp1
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp1
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp1
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp1
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp1
-rw-r--r--llvm/lib/Target/X86/X86ISelLoweringCall.cpp2
-rw-r--r--llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp8
-rw-r--r--llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp8
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp24
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp5
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineInternal.h1
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp37
-rw-r--r--llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp4
-rw-r--r--llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp192
-rw-r--r--llvm/lib/Transforms/Scalar/EarlyCSE.cpp20
-rw-r--r--llvm/lib/Transforms/Scalar/GVN.cpp218
-rw-r--r--llvm/lib/Transforms/Scalar/NewGVN.cpp9
-rw-r--r--llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp10
-rw-r--r--llvm/lib/Transforms/Utils/Local.cpp16
-rw-r--r--llvm/lib/Transforms/Utils/LowerInvoke.cpp26
-rw-r--r--llvm/lib/Transforms/Utils/MisExpect.cpp61
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyCFG.cpp10
-rw-r--r--llvm/test/Analysis/DependenceAnalysis/PreliminaryNoValidityCheckFixedSize.ll2
-rw-r--r--llvm/test/Analysis/DependenceAnalysis/SameSDLoops.ll8
-rw-r--r--llvm/test/Analysis/IR2Vec/unreachable.ll8
-rw-r--r--llvm/test/Assembler/dicompileunit-conflicting-language-fields.ll4
-rw-r--r--llvm/test/Assembler/dicompileunit-invalid-language.ll22
-rw-r--r--llvm/test/Assembler/invalid-dicompileunit-missing-language.ll2
-rw-r--r--llvm/test/Bitcode/dwarf-source-language-name.ll15
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/knownbits-smulh.mir137
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/knownbits-umulh.mir137
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll10
-rw-r--r--llvm/test/CodeGen/AArch64/f16-instructions.ll18
-rw-r--r--llvm/test/CodeGen/AArch64/fcvt-fixed.ll561
-rw-r--r--llvm/test/CodeGen/AArch64/frem-power2.ll3
-rw-r--r--llvm/test/CodeGen/AArch64/pr58431.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/sve-asrd.ll53
-rw-r--r--llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll296
-rw-r--r--llvm/test/CodeGen/AArch64/sve-vector-interleave.ll312
-rw-r--r--llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll52
-rw-r--r--llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll30
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll3215
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll17
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smax.mir6
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smin.mir6
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umax.mir17
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umin.mir17
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll325
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll344
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll108
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll32
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis-heuristics.ll42
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll110
-rw-r--r--llvm/test/CodeGen/AMDGPU/coalescer-avoid-coalesce-class-with-no-registers.ll27
-rw-r--r--llvm/test/CodeGen/AMDGPU/coalescer-avoid-coalesce-class-with-no-registers.mir34
-rw-r--r--llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll138
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmed3.ll46
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll72
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.exp.ll15
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.exp10.ll15
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll76
-rw-r--r--llvm/test/CodeGen/AMDGPU/maximumnum.ll3
-rw-r--r--llvm/test/CodeGen/AMDGPU/mfma-loop.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/minimumnum.ll3
-rw-r--r--llvm/test/CodeGen/AMDGPU/regpressure_printer.mir18
-rw-r--r--llvm/test/CodeGen/AMDGPU/uniform-select.ll64
-rw-r--r--llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll52
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll51
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll52
-rw-r--r--llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll52
-rw-r--r--llvm/test/CodeGen/ARM/sincos.ll27
-rw-r--r--llvm/test/CodeGen/RISCV/cmov-branch-opt.ll5
-rw-r--r--llvm/test/CodeGen/RISCV/features-info.ll1
-rw-r--r--llvm/test/CodeGen/RISCV/xqcicli.ll36
-rw-r--r--llvm/test/CodeGen/RISCV/xqcics.ll42
-rw-r--r--llvm/test/CodeGen/X86/fast-isel-fneg.ll101
-rw-r--r--llvm/test/CodeGen/X86/fp-int-fp-cvt.ll240
-rw-r--r--llvm/test/CodeGen/X86/isel-fneg.ll208
-rw-r--r--llvm/test/CodeGen/X86/stack-protector-target.ll12
-rw-r--r--llvm/test/DebugInfo/X86/DW_OP_LLVM_extract_bits.ll5
-rw-r--r--llvm/test/Instrumentation/AddressSanitizer/with-ifunc.ll7
-rw-r--r--llvm/test/MC/Mips/branch-pseudos-bad.s8
-rw-r--r--llvm/test/TableGen/RuntimeLibcallEmitter-calling-conv.td21
-rw-r--r--llvm/test/TableGen/RuntimeLibcallEmitter-conflict-warning.td61
-rw-r--r--llvm/test/TableGen/RuntimeLibcallEmitter.td36
-rw-r--r--llvm/test/Transforms/CodeGenPrepare/X86/baseoffs-sext-bug.ll81
-rw-r--r--llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll42
-rw-r--r--llvm/test/Transforms/DFAJumpThreading/equivalent-states.ll281
-rw-r--r--llvm/test/Transforms/GVN/condprop.ll2
-rw-r--r--llvm/test/Transforms/InstCombine/ptrtoaddr.ll38
-rw-r--r--llvm/test/Transforms/InstCombine/select-safe-transforms.ll24
-rw-r--r--llvm/test/Transforms/InstCombine/sprintf-1.ll3
-rw-r--r--llvm/test/Transforms/InstSimplify/ptr_diff.ll6
-rw-r--r--llvm/test/Transforms/SafeStack/X86/abi_ssp.ll8
-rw-r--r--llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll23
-rw-r--r--llvm/test/tools/llvm-dwarfdump/X86/DW_AT_language_name.s35
-rw-r--r--llvm/test/tools/llvm-dwarfdump/X86/DW_AT_language_version.s35
-rw-r--r--llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vector-fp.s152
-rw-r--r--llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp16
-rw-r--r--llvm/unittests/ADT/BitsetTest.cpp71
-rw-r--r--llvm/unittests/ADT/CMakeLists.txt1
-rw-r--r--llvm/unittests/Analysis/IR2VecTest.cpp92
-rw-r--r--llvm/unittests/IR/ConstantFPRangeTest.cpp35
-rw-r--r--llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp31
-rw-r--r--llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn2
-rw-r--r--llvm/utils/gn/secondary/clang/lib/Analysis/LifetimeSafety/BUILD.gn16
-rw-r--r--llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/llvm/unittests/ADT/BUILD.gn1
167 files changed, 5968 insertions, 4346 deletions
diff --git a/llvm/docs/MLGO.rst b/llvm/docs/MLGO.rst
index 965a21b..bf3de11 100644
--- a/llvm/docs/MLGO.rst
+++ b/llvm/docs/MLGO.rst
@@ -508,7 +508,7 @@ embeddings can be computed and accessed via an ``ir2vec::Embedder`` instance.
.. code-block:: c++
- const ir2vec::Embedding &FuncVector = Emb->getFunctionVector();
+ ir2vec::Embedding FuncVector = Emb->getFunctionVector();
Currently, ``Embedder`` can generate embeddings at three levels: Instructions,
Basic Blocks, and Functions. Appropriate getters are provided to access the
diff --git a/llvm/include/llvm/ADT/Bitset.h b/llvm/include/llvm/ADT/Bitset.h
index b1e539e..1d4cbf8 100644
--- a/llvm/include/llvm/ADT/Bitset.h
+++ b/llvm/include/llvm/ADT/Bitset.h
@@ -38,14 +38,27 @@ class Bitset {
static constexpr unsigned NumWords =
(NumBits + BitwordBits - 1) / BitwordBits;
-protected:
using StorageType = std::array<BitWord, NumWords>;
-
-private:
StorageType Bits{};
protected:
- constexpr Bitset(const StorageType &B) : Bits{B} {}
+ constexpr Bitset(const std::array<uint64_t, (NumBits + 63) / 64> &B) {
+ if constexpr (sizeof(BitWord) == sizeof(uint64_t)) {
+ for (size_t I = 0; I != B.size(); ++I)
+ Bits[I] = B[I];
+ } else {
+ unsigned BitsToAssign = NumBits;
+ for (size_t I = 0; I != B.size() && BitsToAssign; ++I) {
+ uint64_t Elt = B[I];
+ // On a 32-bit system the storage type will be 32-bit, so we may only
+ // need half of a uint64_t.
+ for (size_t offset = 0; offset != 2 && BitsToAssign; ++offset) {
+ Bits[2 * I + offset] = static_cast<uint32_t>(Elt >> (32 * offset));
+ BitsToAssign = BitsToAssign >= 32 ? BitsToAssign - 32 : 0;
+ }
+ }
+ }
+ }
public:
constexpr Bitset() = default;
diff --git a/llvm/include/llvm/ADT/ImmutableSet.h b/llvm/include/llvm/ADT/ImmutableSet.h
index 017585a4..310539f 100644
--- a/llvm/include/llvm/ADT/ImmutableSet.h
+++ b/llvm/include/llvm/ADT/ImmutableSet.h
@@ -21,7 +21,9 @@
#include "llvm/ADT/iterator.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Signals.h"
#include <cassert>
#include <cstdint>
#include <functional>
diff --git a/llvm/include/llvm/Analysis/DependenceAnalysis.h b/llvm/include/llvm/Analysis/DependenceAnalysis.h
index a2ca695..18a8f8a 100644
--- a/llvm/include/llvm/Analysis/DependenceAnalysis.h
+++ b/llvm/include/llvm/Analysis/DependenceAnalysis.h
@@ -160,7 +160,7 @@ public:
/// getDVEntry - Returns the DV entry associated with a regular or a
/// SameSD level
- DVEntry getDVEntry(unsigned Level, bool isSameSD) const;
+ DVEntry getDVEntry(unsigned Level, bool IsSameSD) const;
/// getDirection - Returns the direction associated with a particular
/// common or SameSD level.
@@ -234,7 +234,7 @@ public:
/// dumpImp - For debugging purposes. Dumps a dependence to OS with or
/// without considering the SameSD levels.
- void dumpImp(raw_ostream &OS, bool SameSD = false) const;
+ void dumpImp(raw_ostream &OS, bool IsSameSD = false) const;
protected:
Instruction *Src, *Dst;
@@ -282,8 +282,8 @@ public:
/// getDVEntry - Returns the DV entry associated with a regular or a
/// SameSD level.
- DVEntry getDVEntry(unsigned Level, bool isSameSD) const {
- if (!isSameSD) {
+ DVEntry getDVEntry(unsigned Level, bool IsSameSD) const {
+ if (!IsSameSD) {
assert(0 < Level && Level <= Levels && "Level out of range");
return DV[Level - 1];
} else {
diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h
index 81409df..6bc51fe 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -533,21 +533,20 @@ protected:
/// in the IR instructions to generate the vector representation.
const float OpcWeight, TypeWeight, ArgWeight;
- // Utility maps - these are used to store the vector representations of
- // instructions, basic blocks and functions.
- mutable Embedding FuncVector;
- mutable BBEmbeddingsMap BBVecMap;
- mutable InstEmbeddingsMap InstVecMap;
-
- LLVM_ABI Embedder(const Function &F, const Vocabulary &Vocab);
+ LLVM_ABI Embedder(const Function &F, const Vocabulary &Vocab)
+ : F(F), Vocab(Vocab), Dimension(Vocab.getDimension()),
+ OpcWeight(ir2vec::OpcWeight), TypeWeight(ir2vec::TypeWeight),
+ ArgWeight(ir2vec::ArgWeight) {}
- /// Function to compute embeddings. It generates embeddings for all
- /// the instructions and basic blocks in the function F.
- void computeEmbeddings() const;
+ /// Function to compute embeddings.
+ Embedding computeEmbeddings() const;
/// Function to compute the embedding for a given basic block.
+ Embedding computeEmbeddings(const BasicBlock &BB) const;
+
+ /// Function to compute the embedding for a given instruction.
/// Specific to the kind of embeddings being computed.
- virtual void computeEmbeddings(const BasicBlock &BB) const = 0;
+ virtual Embedding computeEmbeddings(const Instruction &I) const = 0;
public:
virtual ~Embedder() = default;
@@ -556,23 +555,27 @@ public:
LLVM_ABI static std::unique_ptr<Embedder>
create(IR2VecKind Mode, const Function &F, const Vocabulary &Vocab);
- /// Returns a map containing instructions and the corresponding embeddings for
- /// the function F if it has been computed. If not, it computes the embeddings
- /// for the function and returns the map.
- LLVM_ABI const InstEmbeddingsMap &getInstVecMap() const;
-
- /// Returns a map containing basic block and the corresponding embeddings for
- /// the function F if it has been computed. If not, it computes the embeddings
- /// for the function and returns the map.
- LLVM_ABI const BBEmbeddingsMap &getBBVecMap() const;
+ /// Computes and returns the embedding for a given instruction in the function
+ /// F
+ LLVM_ABI Embedding getInstVector(const Instruction &I) const {
+ return computeEmbeddings(I);
+ }
- /// Returns the embedding for a given basic block in the function F if it has
- /// been computed. If not, it computes the embedding for the basic block and
- /// returns it.
- LLVM_ABI const Embedding &getBBVector(const BasicBlock &BB) const;
+ /// Computes and returns the embedding for a given basic block in the function
+ /// F
+ LLVM_ABI Embedding getBBVector(const BasicBlock &BB) const {
+ return computeEmbeddings(BB);
+ }
/// Computes and returns the embedding for the current function.
- LLVM_ABI const Embedding &getFunctionVector() const;
+ LLVM_ABI Embedding getFunctionVector() const { return computeEmbeddings(); }
+
+ /// Invalidate embeddings if cached. The embeddings may not be relevant
+ /// anymore when the IR changes due to transformations. In such cases, the
+ /// cached embeddings should be invalidated to ensure
+ /// correctness/recomputation. This is a no-op for SymbolicEmbedder but
+ /// removes all the cached entries in FlowAwareEmbedder.
+ virtual void invalidateEmbeddings() { return; }
};
/// Class for computing the Symbolic embeddings of IR2Vec.
@@ -580,7 +583,7 @@ public:
/// representations obtained from the Vocabulary.
class LLVM_ABI SymbolicEmbedder : public Embedder {
private:
- void computeEmbeddings(const BasicBlock &BB) const override;
+ Embedding computeEmbeddings(const Instruction &I) const override;
public:
SymbolicEmbedder(const Function &F, const Vocabulary &Vocab)
@@ -592,11 +595,15 @@ public:
/// embeddings, and additionally capture the flow information in the IR.
class LLVM_ABI FlowAwareEmbedder : public Embedder {
private:
- void computeEmbeddings(const BasicBlock &BB) const override;
+ // FlowAware embeddings would benefit from caching instruction embeddings as
+ // they are reused while computing the embeddings of other instructions.
+ mutable InstEmbeddingsMap InstVecMap;
+ Embedding computeEmbeddings(const Instruction &I) const override;
public:
FlowAwareEmbedder(const Function &F, const Vocabulary &Vocab)
: Embedder(F, Vocab) {}
+ void invalidateEmbeddings() override { InstVecMap.clear(); }
};
} // namespace ir2vec
diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h
index d976d40e..6de99fe 100644
--- a/llvm/include/llvm/AsmParser/LLToken.h
+++ b/llvm/include/llvm/AsmParser/LLToken.h
@@ -488,27 +488,28 @@ enum Kind {
SummaryID, // ^42
// String valued tokens (StrVal).
- LabelStr, // foo:
- GlobalVar, // @foo @"foo"
- ComdatVar, // $foo
- LocalVar, // %foo %"foo"
- MetadataVar, // !foo
- StringConstant, // "foo"
- DwarfTag, // DW_TAG_foo
- DwarfAttEncoding, // DW_ATE_foo
- DwarfVirtuality, // DW_VIRTUALITY_foo
- DwarfLang, // DW_LANG_foo
- DwarfCC, // DW_CC_foo
- EmissionKind, // lineTablesOnly
- NameTableKind, // GNU
- FixedPointKind, // Fixed point
- DwarfOp, // DW_OP_foo
- DIFlag, // DIFlagFoo
- DISPFlag, // DISPFlagFoo
- DwarfMacinfo, // DW_MACINFO_foo
- ChecksumKind, // CSK_foo
- DbgRecordType, // dbg_foo
- DwarfEnumKind, // DW_APPLE_ENUM_KIND_foo
+ LabelStr, // foo:
+ GlobalVar, // @foo @"foo"
+ ComdatVar, // $foo
+ LocalVar, // %foo %"foo"
+ MetadataVar, // !foo
+ StringConstant, // "foo"
+ DwarfTag, // DW_TAG_foo
+ DwarfAttEncoding, // DW_ATE_foo
+ DwarfVirtuality, // DW_VIRTUALITY_foo
+ DwarfLang, // DW_LANG_foo
+ DwarfSourceLangName, // DW_LNAME_foo
+ DwarfCC, // DW_CC_foo
+ EmissionKind, // lineTablesOnly
+ NameTableKind, // GNU
+ FixedPointKind, // Fixed point
+ DwarfOp, // DW_OP_foo
+ DIFlag, // DIFlagFoo
+ DISPFlag, // DISPFlagFoo
+ DwarfMacinfo, // DW_MACINFO_foo
+ ChecksumKind, // CSK_foo
+ DbgRecordType, // dbg_foo
+ DwarfEnumKind, // DW_APPLE_ENUM_KIND_foo
// Type valued tokens (TyVal).
Type,
diff --git a/llvm/include/llvm/CodeGen/SlotIndexes.h b/llvm/include/llvm/CodeGen/SlotIndexes.h
index 1e270b4..780bd82 100644
--- a/llvm/include/llvm/CodeGen/SlotIndexes.h
+++ b/llvm/include/llvm/CodeGen/SlotIndexes.h
@@ -467,16 +467,27 @@ class raw_ostream;
return getMBBRange(mbb).first;
}
- /// Returns the last index in the given basic block number.
+ /// Returns the index past the last valid index in the given basic block.
SlotIndex getMBBEndIdx(unsigned Num) const {
return getMBBRange(Num).second;
}
- /// Returns the last index in the given basic block.
+ /// Returns the index past the last valid index in the given basic block.
SlotIndex getMBBEndIdx(const MachineBasicBlock *mbb) const {
return getMBBRange(mbb).second;
}
+ /// Returns the last valid index in the given basic block.
+ /// This index corresponds to the dead slot of the last non-debug
+ /// instruction and can be used to find live-out ranges of the block. Note
+ /// that getMBBEndIdx returns the start index of the next block, which is
+ /// also used as the start index for segments with phi-def values. If the
+ /// basic block doesn't contain any non-debug instructions, this returns
+ /// the same as getMBBStartIdx.getDeadSlot().
+ SlotIndex getMBBLastIdx(const MachineBasicBlock *MBB) const {
+ return getMBBEndIdx(MBB).getPrevSlot();
+ }
+
/// Iterator over the idx2MBBMap (sorted pairs of slot index of basic block
/// begin and basic block)
using MBBIndexIterator = SmallVectorImpl<IdxMBBPair>::const_iterator;
diff --git a/llvm/include/llvm/IR/ConstantFPRange.h b/llvm/include/llvm/IR/ConstantFPRange.h
index 930c6f9..4a54caa 100644
--- a/llvm/include/llvm/IR/ConstantFPRange.h
+++ b/llvm/include/llvm/IR/ConstantFPRange.h
@@ -200,6 +200,12 @@ public:
/// with another range. The resultant range is guaranteed to include the
/// elements of both sets, but may contain more.
LLVM_ABI ConstantFPRange unionWith(const ConstantFPRange &CR) const;
+
+ /// Calculate absolute value range.
+ LLVM_ABI ConstantFPRange abs() const;
+
+ /// Calculate range of negated values.
+ LLVM_ABI ConstantFPRange negate() const;
};
inline raw_ostream &operator<<(raw_ostream &OS, const ConstantFPRange &CR) {
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 2e31fe5..7934277 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -2184,6 +2184,18 @@ inline PtrToIntSameSize_match<OpTy> m_PtrToIntSameSize(const DataLayout &DL,
return PtrToIntSameSize_match<OpTy>(DL, Op);
}
+/// Matches PtrToAddr.
+template <typename OpTy>
+inline CastOperator_match<OpTy, Instruction::PtrToAddr>
+m_PtrToAddr(const OpTy &Op) {
+ return CastOperator_match<OpTy, Instruction::PtrToAddr>(Op);
+}
+
+/// Matches PtrToInt or PtrToAddr.
+template <typename OpTy> inline auto m_PtrToIntOrAddr(const OpTy &Op) {
+ return m_CombineOr(m_PtrToInt(Op), m_PtrToAddr(Op));
+}
+
/// Matches IntToPtr.
template <typename OpTy>
inline CastOperator_match<OpTy, Instruction::IntToPtr>
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h
index 93183bc..307cc66 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.h
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.h
@@ -15,6 +15,7 @@
#define LLVM_IR_RUNTIME_LIBCALLS_H
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Bitset.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/StringTable.h"
#include "llvm/IR/CallingConv.h"
@@ -54,8 +55,22 @@ static inline auto libcall_impls() {
static_cast<RTLIB::LibcallImpl>(RTLIB::NumLibcallImpls));
}
+/// Manage a bitset representing the list of available libcalls for a module.
+class LibcallImplBitset : public Bitset<RTLIB::NumLibcallImpls> {
+public:
+ constexpr LibcallImplBitset() = default;
+ constexpr LibcallImplBitset(
+ const std::array<uint64_t, (RTLIB::NumLibcallImpls + 63) / 64> &Src)
+ : Bitset(Src) {}
+};
+
/// A simple container for information about the supported runtime calls.
struct RuntimeLibcallsInfo {
+private:
+ /// Bitset of libcalls a module may emit a call to.
+ LibcallImplBitset AvailableLibcallImpls;
+
+public:
explicit RuntimeLibcallsInfo(
const Triple &TT,
ExceptionHandling ExceptionModel = ExceptionHandling::None,
@@ -129,6 +144,14 @@ struct RuntimeLibcallsInfo {
return getLibcallName(RTLIB::MEMMOVE);
}
+ bool isAvailable(RTLIB::LibcallImpl Impl) const {
+ return AvailableLibcallImpls.test(Impl);
+ }
+
+ void setAvailable(RTLIB::LibcallImpl Impl) {
+ AvailableLibcallImpls.set(Impl);
+ }
+
/// Return the libcall provided by \p Impl
static RTLIB::Libcall getLibcallFromImpl(RTLIB::LibcallImpl Impl) {
return ImplToLibcall[Impl];
@@ -225,8 +248,7 @@ private:
/// Return true if the target has sincosf/sincos/sincosl functions
static bool hasSinCos(const Triple &TT) {
- return TT.isGNUEnvironment() || TT.isOSFuchsia() ||
- (TT.isAndroid() && !TT.isAndroidVersionLT(9));
+ return TT.isGNUEnvironment() || TT.isOSFuchsia() || TT.isAndroid();
}
static bool hasSinCos_f32_f64(const Triple &TT) {
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index e2b7a5e..3d21f52 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -695,6 +695,7 @@ def constant_fold_fabs : constant_fold_unary_fp_op_rule<G_FABS>;
def constant_fold_fsqrt : constant_fold_unary_fp_op_rule<G_FSQRT>;
def constant_fold_flog2 : constant_fold_unary_fp_op_rule<G_FLOG2>;
def constant_fold_fptrunc : constant_fold_unary_fp_op_rule<G_FPTRUNC>;
+def constant_fold_fpext : constant_fold_unary_fp_op_rule<G_FPEXT>;
// Fold constant zero int to fp conversions.
class itof_const_zero_fold_rule<Instruction opcode> : GICombineRule <
@@ -713,6 +714,7 @@ def constant_fold_fp_ops : GICombineGroup<[
constant_fold_fsqrt,
constant_fold_flog2,
constant_fold_fptrunc,
+ constant_fold_fpext,
itof_const_zero_fold_si,
itof_const_zero_fold_ui
]>;
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 7bc90d4..774063b 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -116,7 +116,7 @@ def SDTIntBinOp : SDTypeProfile<1, 2, [ // add, and, or, xor, udiv, etc.
SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>
]>;
def SDTIntShiftOp : SDTypeProfile<1, 2, [ // shl, sra, srl
- SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>
+ SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisInt<2>, SDTCisSameNumEltsAs<0, 2>
]>;
def SDTIntShiftPairOp : SDTypeProfile<2, 3, [ // shl_parts, sra_parts, srl_parts
SDTCisInt<0>, SDTCisSameAs<1, 0>,
diff --git a/llvm/include/llvm/Transforms/Scalar/GVN.h b/llvm/include/llvm/Transforms/Scalar/GVN.h
index 74a4d6c..bc0f108 100644
--- a/llvm/include/llvm/Transforms/Scalar/GVN.h
+++ b/llvm/include/llvm/Transforms/Scalar/GVN.h
@@ -28,6 +28,7 @@
#include <cstdint>
#include <optional>
#include <utility>
+#include <variant>
#include <vector>
namespace llvm {
@@ -322,11 +323,6 @@ private:
};
LeaderMap LeaderTable;
- // Block-local map of equivalent values to their leader, does not
- // propagate to any successors. Entries added mid-block are applied
- // to the remaining instructions in the block.
- SmallMapVector<Value *, Value *, 4> ReplaceOperandsWithMap;
-
// Map the block to reversed postorder traversal number. It is used to
// find back edge easily.
DenseMap<AssertingVH<BasicBlock>, uint32_t> BlockRPONumber;
@@ -402,9 +398,9 @@ private:
void verifyRemoved(const Instruction *I) const;
bool splitCriticalEdges();
BasicBlock *splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ);
- bool replaceOperandsForInBlockEquality(Instruction *I) const;
- bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root,
- bool DominatesByEdge);
+ bool
+ propagateEquality(Value *LHS, Value *RHS,
+ const std::variant<BasicBlockEdge, Instruction *> &Root);
bool processFoldableCondBr(BranchInst *BI);
void addDeadBlock(BasicBlock *BB);
void assignValNumForDeadCode();
diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h
index 3f5f427..9acfd87 100644
--- a/llvm/include/llvm/Transforms/Utils/Local.h
+++ b/llvm/include/llvm/Transforms/Utils/Local.h
@@ -452,6 +452,11 @@ LLVM_ABI unsigned replaceDominatedUsesWith(Value *From, Value *To,
LLVM_ABI unsigned replaceDominatedUsesWith(Value *From, Value *To,
DominatorTree &DT,
const BasicBlock *BB);
+/// Replace each use of 'From' with 'To' if that use is dominated by the
+/// given instruction. Returns the number of replacements made.
+LLVM_ABI unsigned replaceDominatedUsesWith(Value *From, Value *To,
+ DominatorTree &DT,
+ const Instruction *I);
/// Replace each use of 'From' with 'To' if that use is dominated by
/// the given edge and the callback ShouldReplace returns true. Returns the
/// number of replacements made.
@@ -464,6 +469,12 @@ LLVM_ABI unsigned replaceDominatedUsesWithIf(
LLVM_ABI unsigned replaceDominatedUsesWithIf(
Value *From, Value *To, DominatorTree &DT, const BasicBlock *BB,
function_ref<bool(const Use &U, const Value *To)> ShouldReplace);
+/// Replace each use of 'From' with 'To' if that use is dominated by
+/// the given instruction and the callback ShouldReplace returns true. Returns
+/// the number of replacements made.
+LLVM_ABI unsigned replaceDominatedUsesWithIf(
+ Value *From, Value *To, DominatorTree &DT, const Instruction *I,
+ function_ref<bool(const Use &U, const Value *To)> ShouldReplace);
/// Return true if this call calls a gc leaf function.
///
diff --git a/llvm/include/llvm/Transforms/Utils/MisExpect.h b/llvm/include/llvm/Transforms/Utils/MisExpect.h
index e9fba47..2d35d92 100644
--- a/llvm/include/llvm/Transforms/Utils/MisExpect.h
+++ b/llvm/include/llvm/Transforms/Utils/MisExpect.h
@@ -22,8 +22,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
-namespace llvm {
-namespace misexpect {
+namespace llvm::misexpect {
/// checkBackendInstrumentation - compares PGO counters to the thresholds used
/// for llvm.expect and warns if the PGO counters are outside of the expected
@@ -34,8 +33,8 @@ namespace misexpect {
///
/// \param I The Instruction being checked
/// \param RealWeights A vector of profile weights for each target block
-void checkBackendInstrumentation(Instruction &I,
- const llvm::ArrayRef<uint32_t> RealWeights);
+void checkBackendInstrumentation(const Instruction &I,
+ ArrayRef<uint32_t> RealWeights);
/// checkFrontendInstrumentation - compares PGO counters to the thresholds used
/// for llvm.expect and warns if the PGO counters are outside of the expected
@@ -48,8 +47,8 @@ void checkBackendInstrumentation(Instruction &I,
/// \param I The Instruction being checked
/// \param ExpectedWeights A vector of the expected weights for each target
/// block, this determines the threshold values used when emitting diagnostics
-void checkFrontendInstrumentation(Instruction &I,
- const ArrayRef<uint32_t> ExpectedWeights);
+void checkFrontendInstrumentation(const Instruction &I,
+ ArrayRef<uint32_t> ExpectedWeights);
/// veryifyMisExpect - compares RealWeights to the thresholds used
/// for llvm.expect and warns if the PGO counters are outside of the expected
@@ -58,8 +57,8 @@ void checkFrontendInstrumentation(Instruction &I,
/// \param I The Instruction being checked
/// \param RealWeights A vector of profile weights from the profile data
/// \param ExpectedWeights A vector of the weights attatch by llvm.expect
-void verifyMisExpect(Instruction &I, ArrayRef<uint32_t> RealWeights,
- const ArrayRef<uint32_t> ExpectedWeights);
+void verifyMisExpect(const Instruction &I, ArrayRef<uint32_t> RealWeights,
+ ArrayRef<uint32_t> ExpectedWeights);
/// checkExpectAnnotations - compares PGO counters to the thresholds used
/// for llvm.expect and warns if the PGO counters are outside of the expected
@@ -72,11 +71,10 @@ void verifyMisExpect(Instruction &I, ArrayRef<uint32_t> RealWeights,
/// \param I The Instruction being checked
/// \param ExistingWeights A vector of profile weights for each target block
/// \param IsFrontend A boolean describing if this is Frontend instrumentation
-void checkExpectAnnotations(Instruction &I,
- const ArrayRef<uint32_t> ExistingWeights,
+void checkExpectAnnotations(const Instruction &I,
+ ArrayRef<uint32_t> ExistingWeights,
bool IsFrontend);
-} // namespace misexpect
-} // namespace llvm
+} // namespace llvm::misexpect
#endif
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp
index 1f0da8d1..8d20b0e 100644
--- a/llvm/lib/Analysis/DependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/DependenceAnalysis.cpp
@@ -275,7 +275,7 @@ bool Dependence::isAnti() const {
// if no subscript in the source or destination mention the induction
// variable associated with the loop at this level.
// Leave this out of line, so it will serve as a virtual method anchor
-bool Dependence::isScalar(unsigned level, bool isSameSD) const { return false; }
+bool Dependence::isScalar(unsigned level, bool IsSameSD) const { return false; }
//===----------------------------------------------------------------------===//
// FullDependence methods
@@ -351,38 +351,38 @@ bool FullDependence::normalize(ScalarEvolution *SE) {
// getDirection - Returns the direction associated with a particular common or
// SameSD level.
-unsigned FullDependence::getDirection(unsigned Level, bool isSameSD) const {
- return getDVEntry(Level, isSameSD).Direction;
+unsigned FullDependence::getDirection(unsigned Level, bool IsSameSD) const {
+ return getDVEntry(Level, IsSameSD).Direction;
}
// Returns the distance (or NULL) associated with a particular common or
// SameSD level.
-const SCEV *FullDependence::getDistance(unsigned Level, bool isSameSD) const {
- return getDVEntry(Level, isSameSD).Distance;
+const SCEV *FullDependence::getDistance(unsigned Level, bool IsSameSD) const {
+ return getDVEntry(Level, IsSameSD).Distance;
}
// Returns true if a particular regular or SameSD level is scalar; that is,
// if no subscript in the source or destination mention the induction variable
// associated with the loop at this level.
-bool FullDependence::isScalar(unsigned Level, bool isSameSD) const {
- return getDVEntry(Level, isSameSD).Scalar;
+bool FullDependence::isScalar(unsigned Level, bool IsSameSD) const {
+ return getDVEntry(Level, IsSameSD).Scalar;
}
// Returns true if peeling the first iteration from this regular or SameSD
// loop level will break this dependence.
-bool FullDependence::isPeelFirst(unsigned Level, bool isSameSD) const {
- return getDVEntry(Level, isSameSD).PeelFirst;
+bool FullDependence::isPeelFirst(unsigned Level, bool IsSameSD) const {
+ return getDVEntry(Level, IsSameSD).PeelFirst;
}
// Returns true if peeling the last iteration from this regular or SameSD
// loop level will break this dependence.
-bool FullDependence::isPeelLast(unsigned Level, bool isSameSD) const {
- return getDVEntry(Level, isSameSD).PeelLast;
+bool FullDependence::isPeelLast(unsigned Level, bool IsSameSD) const {
+ return getDVEntry(Level, IsSameSD).PeelLast;
}
// Returns true if splitting loop will break the dependence.
-bool FullDependence::isSplitable(unsigned Level, bool isSameSD) const {
- return getDVEntry(Level, isSameSD).Splitable;
+bool FullDependence::isSplitable(unsigned Level, bool IsSameSD) const {
+ return getDVEntry(Level, IsSameSD).Splitable;
}
// inSameSDLoops - Returns true if this level is an SameSD level, i.e.,
@@ -691,7 +691,7 @@ void Dependence::dump(raw_ostream &OS) const {
dumpImp(OS);
unsigned SameSDLevels = getSameSDLevels();
if (SameSDLevels > 0) {
- OS << "! / assuming " << SameSDLevels << " loop level(s) fused: ";
+ OS << " / assuming " << SameSDLevels << " loop level(s) fused: ";
dumpImp(OS, true);
}
}
@@ -706,13 +706,13 @@ void Dependence::dump(raw_ostream &OS) const {
// For debugging purposes. Dumps a dependence to OS with or without considering
// the SameSD levels.
-void Dependence::dumpImp(raw_ostream &OS, bool isSameSD) const {
+void Dependence::dumpImp(raw_ostream &OS, bool IsSameSD) const {
bool Splitable = false;
unsigned Levels = getLevels();
unsigned SameSDLevels = getSameSDLevels();
bool OnSameSD = false;
unsigned LevelNum = Levels;
- if (isSameSD)
+ if (IsSameSD)
LevelNum += SameSDLevels;
OS << " [";
for (unsigned II = 1; II <= LevelNum; ++II) {
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index 1794a60..85b5372 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -153,11 +153,6 @@ void Embedding::print(raw_ostream &OS) const {
// Embedder and its subclasses
//===----------------------------------------------------------------------===//
-Embedder::Embedder(const Function &F, const Vocabulary &Vocab)
- : F(F), Vocab(Vocab), Dimension(Vocab.getDimension()),
- OpcWeight(::OpcWeight), TypeWeight(::TypeWeight), ArgWeight(::ArgWeight),
- FuncVector(Embedding(Dimension)) {}
-
std::unique_ptr<Embedder> Embedder::create(IR2VecKind Mode, const Function &F,
const Vocabulary &Vocab) {
switch (Mode) {
@@ -169,110 +164,85 @@ std::unique_ptr<Embedder> Embedder::create(IR2VecKind Mode, const Function &F,
return nullptr;
}
-const InstEmbeddingsMap &Embedder::getInstVecMap() const {
- if (InstVecMap.empty())
- computeEmbeddings();
- return InstVecMap;
-}
-
-const BBEmbeddingsMap &Embedder::getBBVecMap() const {
- if (BBVecMap.empty())
- computeEmbeddings();
- return BBVecMap;
-}
-
-const Embedding &Embedder::getBBVector(const BasicBlock &BB) const {
- auto It = BBVecMap.find(&BB);
- if (It != BBVecMap.end())
- return It->second;
- computeEmbeddings(BB);
- return BBVecMap[&BB];
-}
+Embedding Embedder::computeEmbeddings() const {
+ Embedding FuncVector(Dimension, 0.0);
-const Embedding &Embedder::getFunctionVector() const {
- // Currently, we always (re)compute the embeddings for the function.
- // This is cheaper than caching the vector.
- computeEmbeddings();
- return FuncVector;
-}
-
-void Embedder::computeEmbeddings() const {
if (F.isDeclaration())
- return;
-
- FuncVector = Embedding(Dimension, 0.0);
+ return FuncVector;
// Consider only the basic blocks that are reachable from entry
- for (const BasicBlock *BB : depth_first(&F)) {
- computeEmbeddings(*BB);
- FuncVector += BBVecMap[BB];
- }
+ for (const BasicBlock *BB : depth_first(&F))
+ FuncVector += computeEmbeddings(*BB);
+ return FuncVector;
}
-void SymbolicEmbedder::computeEmbeddings(const BasicBlock &BB) const {
+Embedding Embedder::computeEmbeddings(const BasicBlock &BB) const {
Embedding BBVector(Dimension, 0);
// We consider only the non-debug and non-pseudo instructions
- for (const auto &I : BB.instructionsWithoutDebug()) {
- Embedding ArgEmb(Dimension, 0);
- for (const auto &Op : I.operands())
- ArgEmb += Vocab[*Op];
- auto InstVector =
- Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb;
- if (const auto *IC = dyn_cast<CmpInst>(&I))
- InstVector += Vocab[IC->getPredicate()];
- InstVecMap[&I] = InstVector;
- BBVector += InstVector;
- }
- BBVecMap[&BB] = BBVector;
-}
-
-void FlowAwareEmbedder::computeEmbeddings(const BasicBlock &BB) const {
- Embedding BBVector(Dimension, 0);
+ for (const auto &I : BB.instructionsWithoutDebug())
+ BBVector += computeEmbeddings(I);
+ return BBVector;
+}
+
+Embedding SymbolicEmbedder::computeEmbeddings(const Instruction &I) const {
+ // Currently, we always (re)compute the embeddings for symbolic embedder.
+ // This is cheaper than caching the vectors.
+ Embedding ArgEmb(Dimension, 0);
+ for (const auto &Op : I.operands())
+ ArgEmb += Vocab[*Op];
+ auto InstVector =
+ Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb;
+ if (const auto *IC = dyn_cast<CmpInst>(&I))
+ InstVector += Vocab[IC->getPredicate()];
+ return InstVector;
+}
+
+Embedding FlowAwareEmbedder::computeEmbeddings(const Instruction &I) const {
+ // If we have already computed the embedding for this instruction, return it
+ auto It = InstVecMap.find(&I);
+ if (It != InstVecMap.end())
+ return It->second;
- // We consider only the non-debug and non-pseudo instructions
- for (const auto &I : BB.instructionsWithoutDebug()) {
- // TODO: Handle call instructions differently.
- // For now, we treat them like other instructions
- Embedding ArgEmb(Dimension, 0);
- for (const auto &Op : I.operands()) {
- // If the operand is defined elsewhere, we use its embedding
- if (const auto *DefInst = dyn_cast<Instruction>(Op)) {
- auto DefIt = InstVecMap.find(DefInst);
- // Fixme (#159171): Ideally we should never miss an instruction
- // embedding here.
- // But when we have cyclic dependencies (e.g., phi
- // nodes), we might miss the embedding. In such cases, we fall back to
- // using the vocabulary embedding. This can be fixed by iterating to a
- // fixed-point, or by using a simple solver for the set of simultaneous
- // equations.
- // Another case when we might miss an instruction embedding is when
- // the operand instruction is in a different basic block that has not
- // been processed yet. This can be fixed by processing the basic blocks
- // in a topological order.
- if (DefIt != InstVecMap.end())
- ArgEmb += DefIt->second;
- else
- ArgEmb += Vocab[*Op];
- }
- // If the operand is not defined by an instruction, we use the vocabulary
- else {
- LLVM_DEBUG(errs() << "Using embedding from vocabulary for operand: "
- << *Op << "=" << Vocab[*Op][0] << "\n");
+ // TODO: Handle call instructions differently.
+ // For now, we treat them like other instructions
+ Embedding ArgEmb(Dimension, 0);
+ for (const auto &Op : I.operands()) {
+ // If the operand is defined elsewhere, we use its embedding
+ if (const auto *DefInst = dyn_cast<Instruction>(Op)) {
+ auto DefIt = InstVecMap.find(DefInst);
+ // Fixme (#159171): Ideally we should never miss an instruction
+ // embedding here.
+ // But when we have cyclic dependencies (e.g., phi
+ // nodes), we might miss the embedding. In such cases, we fall back to
+ // using the vocabulary embedding. This can be fixed by iterating to a
+ // fixed-point, or by using a simple solver for the set of simultaneous
+ // equations.
+ // Another case when we might miss an instruction embedding is when
+ // the operand instruction is in a different basic block that has not
+ // been processed yet. This can be fixed by processing the basic blocks
+ // in a topological order.
+ if (DefIt != InstVecMap.end())
+ ArgEmb += DefIt->second;
+ else
ArgEmb += Vocab[*Op];
- }
}
- // Create the instruction vector by combining opcode, type, and arguments
- // embeddings
- auto InstVector =
- Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb;
- // Add compare predicate embedding as an additional operand if applicable
- if (const auto *IC = dyn_cast<CmpInst>(&I))
- InstVector += Vocab[IC->getPredicate()];
- InstVecMap[&I] = InstVector;
- BBVector += InstVector;
+ // If the operand is not defined by an instruction, we use the
+ // vocabulary
+ else {
+ LLVM_DEBUG(errs() << "Using embedding from vocabulary for operand: "
+ << *Op << "=" << Vocab[*Op][0] << "\n");
+ ArgEmb += Vocab[*Op];
+ }
}
- BBVecMap[&BB] = BBVector;
+ // Create the instruction vector by combining opcode, type, and arguments
+ // embeddings
+ auto InstVector =
+ Vocab[I.getOpcode()] + Vocab[I.getType()->getTypeID()] + ArgEmb;
+ if (const auto *IC = dyn_cast<CmpInst>(&I))
+ InstVector += Vocab[IC->getPredicate()];
+ InstVecMap[&I] = InstVector;
+ return InstVector;
}
// ==----------------------------------------------------------------------===//
@@ -695,25 +665,17 @@ PreservedAnalyses IR2VecPrinterPass::run(Module &M,
Emb->getFunctionVector().print(OS);
OS << "Basic block vectors:\n";
- const auto &BBMap = Emb->getBBVecMap();
for (const BasicBlock &BB : F) {
- auto It = BBMap.find(&BB);
- if (It != BBMap.end()) {
- OS << "Basic block: " << BB.getName() << ":\n";
- It->second.print(OS);
- }
+ OS << "Basic block: " << BB.getName() << ":\n";
+ Emb->getBBVector(BB).print(OS);
}
OS << "Instruction vectors:\n";
- const auto &InstMap = Emb->getInstVecMap();
for (const BasicBlock &BB : F) {
for (const Instruction &I : BB) {
- auto It = InstMap.find(&I);
- if (It != InstMap.end()) {
- OS << "Instruction: ";
- I.print(OS);
- It->second.print(OS);
- }
+ OS << "Instruction: ";
+ I.print(OS);
+ Emb->getInstVector(I).print(OS);
}
}
}
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index d1977f0..4e38626 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -671,12 +671,12 @@ Value *llvm::simplifyAddInst(Value *Op0, Value *Op1, bool IsNSW, bool IsNUW,
/// This is very similar to stripAndAccumulateConstantOffsets(), except it
/// normalizes the offset bitwidth to the stripped pointer type, not the
/// original pointer type.
-static APInt stripAndComputeConstantOffsets(const DataLayout &DL, Value *&V,
- bool AllowNonInbounds = false) {
+static APInt stripAndComputeConstantOffsets(const DataLayout &DL, Value *&V) {
assert(V->getType()->isPtrOrPtrVectorTy());
APInt Offset = APInt::getZero(DL.getIndexTypeSizeInBits(V->getType()));
- V = V->stripAndAccumulateConstantOffsets(DL, Offset, AllowNonInbounds);
+ V = V->stripAndAccumulateConstantOffsets(DL, Offset,
+ /*AllowNonInbounds=*/true);
// As that strip may trace through `addrspacecast`, need to sext or trunc
// the offset calculated.
return Offset.sextOrTrunc(DL.getIndexTypeSizeInBits(V->getType()));
@@ -853,10 +853,12 @@ static Value *simplifySubInst(Value *Op0, Value *Op1, bool IsNSW, bool IsNUW,
return W;
// Variations on GEP(base, I, ...) - GEP(base, i, ...) -> GEP(null, I-i, ...).
- if (match(Op0, m_PtrToInt(m_Value(X))) && match(Op1, m_PtrToInt(m_Value(Y))))
+ if (match(Op0, m_PtrToIntOrAddr(m_Value(X))) &&
+ match(Op1, m_PtrToIntOrAddr(m_Value(Y)))) {
if (Constant *Result = computePointerDifference(Q.DL, X, Y))
return ConstantFoldIntegerCast(Result, Op0->getType(), /*IsSigned*/ true,
Q.DL);
+ }
// i1 sub -> xor.
if (MaxRecurse && Op0->getType()->isIntOrIntVectorTy(1))
diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp
index 6e92766..813632c 100644
--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -740,11 +740,6 @@ static void initializeLibCalls(TargetLibraryInfoImpl &TLI, const Triple &T,
TLI.setAvailable(LibFunc_fgets_unlocked);
}
- if (T.isAndroid() && T.isAndroidVersionLT(21)) {
- TLI.setUnavailable(LibFunc_stpcpy);
- TLI.setUnavailable(LibFunc_stpncpy);
- }
-
if (T.isPS()) {
// PS4/PS5 do have memalign.
TLI.setAvailable(LibFunc_memalign);
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index f6937d3..50d1d47 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -982,6 +982,7 @@ lltok::Kind LLLexer::LexIdentifier() {
DWKEYWORD(ATE, DwarfAttEncoding);
DWKEYWORD(VIRTUALITY, DwarfVirtuality);
DWKEYWORD(LANG, DwarfLang);
+ DWKEYWORD(LNAME, DwarfSourceLangName);
DWKEYWORD(CC, DwarfCC);
DWKEYWORD(OP, DwarfOp);
DWKEYWORD(MACINFO, DwarfMacinfo);
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 5589966..380b192 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -4740,6 +4740,10 @@ struct DwarfLangField : public MDUnsignedField {
DwarfLangField() : MDUnsignedField(0, dwarf::DW_LANG_hi_user) {}
};
+struct DwarfSourceLangNameField : public MDUnsignedField {
+ DwarfSourceLangNameField() : MDUnsignedField(0, UINT32_MAX) {}
+};
+
struct DwarfCCField : public MDUnsignedField {
DwarfCCField() : MDUnsignedField(0, dwarf::DW_CC_hi_user) {}
};
@@ -4998,6 +5002,25 @@ bool LLParser::parseMDField(LocTy Loc, StringRef Name, DwarfLangField &Result) {
}
template <>
+bool LLParser::parseMDField(LocTy Loc, StringRef Name,
+ DwarfSourceLangNameField &Result) {
+ if (Lex.getKind() == lltok::APSInt)
+ return parseMDField(Loc, Name, static_cast<MDUnsignedField &>(Result));
+
+ if (Lex.getKind() != lltok::DwarfSourceLangName)
+ return tokError("expected DWARF source language name");
+
+ unsigned Lang = dwarf::getSourceLanguageName(Lex.getStrVal());
+ if (!Lang)
+ return tokError("invalid DWARF source language name" + Twine(" '") +
+ Lex.getStrVal() + "'");
+ assert(Lang <= Result.Max && "Expected valid DWARF source language name");
+ Result.assign(Lang);
+ Lex.Lex();
+ return false;
+}
+
+template <>
bool LLParser::parseMDField(LocTy Loc, StringRef Name, DwarfCCField &Result) {
if (Lex.getKind() == lltok::APSInt)
return parseMDField(Loc, Name, static_cast<MDUnsignedField &>(Result));
@@ -5836,9 +5859,12 @@ bool LLParser::parseDICompileUnit(MDNode *&Result, bool IsDistinct) {
if (!IsDistinct)
return tokError("missing 'distinct', required for !DICompileUnit");
+ LocTy Loc = Lex.getLoc();
+
#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \
- REQUIRED(language, DwarfLangField, ); \
REQUIRED(file, MDField, (/* AllowNull */ false)); \
+ OPTIONAL(language, DwarfLangField, ); \
+ OPTIONAL(sourceLanguageName, DwarfSourceLangNameField, ); \
OPTIONAL(producer, MDStringField, ); \
OPTIONAL(isOptimized, MDBoolField, ); \
OPTIONAL(flags, MDStringField, ); \
@@ -5860,12 +5886,23 @@ bool LLParser::parseDICompileUnit(MDNode *&Result, bool IsDistinct) {
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
+ if (!language.Seen && !sourceLanguageName.Seen)
+ return error(Loc, "missing one of 'language' or 'sourceLanguageName', "
+ "required for !DICompileUnit");
+
+ if (language.Seen && sourceLanguageName.Seen)
+ return error(Loc, "can only specify one of 'language' and "
+ "'sourceLanguageName' on !DICompileUnit");
+
Result = DICompileUnit::getDistinct(
- Context, DISourceLanguageName(language.Val), file.Val, producer.Val,
- isOptimized.Val, flags.Val, runtimeVersion.Val, splitDebugFilename.Val,
- emissionKind.Val, enums.Val, retainedTypes.Val, globals.Val, imports.Val,
- macros.Val, dwoId.Val, splitDebugInlining.Val, debugInfoForProfiling.Val,
- nameTableKind.Val, rangesBaseAddress.Val, sysroot.Val, sdk.Val);
+ Context,
+ language.Seen ? DISourceLanguageName(language.Val)
+ : DISourceLanguageName(sourceLanguageName.Val, 0),
+ file.Val, producer.Val, isOptimized.Val, flags.Val, runtimeVersion.Val,
+ splitDebugFilename.Val, emissionKind.Val, enums.Val, retainedTypes.Val,
+ globals.Val, imports.Val, macros.Val, dwoId.Val, splitDebugInlining.Val,
+ debugInfoForProfiling.Val, nameTableKind.Val, rangesBaseAddress.Val,
+ sysroot.Val, sdk.Val);
return false;
}
diff --git a/llvm/lib/BinaryFormat/Dwarf.cpp b/llvm/lib/BinaryFormat/Dwarf.cpp
index 969047a..55fa2df 100644
--- a/llvm/lib/BinaryFormat/Dwarf.cpp
+++ b/llvm/lib/BinaryFormat/Dwarf.cpp
@@ -893,6 +893,8 @@ StringRef llvm::dwarf::AttributeValueString(uint16_t Attr, unsigned Val) {
return DefaultedMemberString(Val);
case DW_AT_APPLE_enum_kind:
return EnumKindString(Val);
+ case DW_AT_language_name:
+ return SourceLanguageNameString(static_cast<SourceLanguageName>(Val));
}
return StringRef();
diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
index a4d1b83..cdcf7a8 100644
--- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
+++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
@@ -1867,12 +1867,18 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
// distinct. It's always distinct.
IsDistinct = true;
+ const auto LangVersionMask = (uint64_t(1) << 63);
+ const bool HasVersionedLanguage = Record[1] & LangVersionMask;
+
auto *CU = DICompileUnit::getDistinct(
- Context, DISourceLanguageName(Record[1]), getMDOrNull(Record[2]),
- getMDString(Record[3]), Record[4], getMDString(Record[5]), Record[6],
- getMDString(Record[7]), Record[8], getMDOrNull(Record[9]),
- getMDOrNull(Record[10]), getMDOrNull(Record[12]),
- getMDOrNull(Record[13]),
+ Context,
+ HasVersionedLanguage
+ ? DISourceLanguageName(Record[1] & ~LangVersionMask, 0)
+ : DISourceLanguageName(Record[1]),
+ getMDOrNull(Record[2]), getMDString(Record[3]), Record[4],
+ getMDString(Record[5]), Record[6], getMDString(Record[7]), Record[8],
+ getMDOrNull(Record[9]), getMDOrNull(Record[10]),
+ getMDOrNull(Record[12]), getMDOrNull(Record[13]),
Record.size() <= 15 ? nullptr : getMDOrNull(Record[15]),
Record.size() <= 14 ? 0 : Record[14],
Record.size() <= 16 ? true : Record[16],
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 7ed140d..0ca55a26 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -2108,7 +2108,13 @@ void ModuleBitcodeWriter::writeDICompileUnit(const DICompileUnit *N,
assert(N->isDistinct() && "Expected distinct compile units");
Record.push_back(/* IsDistinct */ true);
- Record.push_back(N->getSourceLanguage().getUnversionedName());
+ auto Lang = N->getSourceLanguage();
+ Record.push_back(Lang.getName());
+  // Set bit so the MetadataLoader can distinguish between versioned and
+  // unversioned names.
+ if (Lang.hasVersionedName())
+ Record.back() ^= (uint64_t(1) << 63);
+
Record.push_back(VE.getMetadataOrNullID(N->getFile()));
Record.push_back(VE.getMetadataOrNullID(N->getRawProducer()));
Record.push_back(N->isOptimized());
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
index bc0bb34..f0f0861 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
@@ -587,10 +587,12 @@ bool DwarfExpression::addExpression(
emitUnsigned(LeftShift);
emitOp(dwarf::DW_OP_shl);
}
- emitOp(dwarf::DW_OP_constu);
- emitUnsigned(RightShift);
- emitOp(OpNum == dwarf::DW_OP_LLVM_extract_bits_sext ? dwarf::DW_OP_shra
- : dwarf::DW_OP_shr);
+ if (RightShift) {
+ emitOp(dwarf::DW_OP_constu);
+ emitUnsigned(RightShift);
+ emitOp(OpNum == dwarf::DW_OP_LLVM_extract_bits_sext ? dwarf::DW_OP_shra
+ : dwarf::DW_OP_shr);
+ }
// The value is now at the top of the stack, so set the location to
// implicit so that we get a stack_value at the end.
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index eb73d01b..4320b1d 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -3194,7 +3194,7 @@ struct ExtAddrMode : public TargetLowering::AddrMode {
case ScaledRegField:
return ScaledReg;
case BaseOffsField:
- return ConstantInt::get(IntPtrTy, BaseOffs);
+ return ConstantInt::getSigned(IntPtrTy, BaseOffs);
}
}
@@ -6100,7 +6100,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
// Add in the Base Offset if present.
if (AddrMode.BaseOffs) {
- Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
+ Value *V = ConstantInt::getSigned(IntPtrTy, AddrMode.BaseOffs);
if (ResultIndex) {
// We need to add this separately from the scale above to help with
// SDAG consecutive load/store merging.
@@ -6226,7 +6226,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
// Add in the Base Offset if present.
if (AddrMode.BaseOffs) {
- Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
+ Value *V = ConstantInt::getSigned(IntPtrTy, AddrMode.BaseOffs);
if (Result)
Result = Builder.CreateAdd(Result, V, "sunkaddr");
else
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 906d62a3..b425b95 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1728,6 +1728,7 @@ static APFloat constantFoldFpUnary(const MachineInstr &MI,
Result.clearSign();
return Result;
}
+ case TargetOpcode::G_FPEXT:
case TargetOpcode::G_FPTRUNC: {
bool Unused;
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
index 3f6813e..90c60d4 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
@@ -344,6 +344,22 @@ void GISelValueTracking::computeKnownBitsImpl(Register R, KnownBits &Known,
Known = KnownBits::mul(Known, Known2);
break;
}
+ case TargetOpcode::G_UMULH: {
+ computeKnownBitsImpl(MI.getOperand(2).getReg(), Known, DemandedElts,
+ Depth + 1);
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts,
+ Depth + 1);
+ Known = KnownBits::mulhu(Known, Known2);
+ break;
+ }
+ case TargetOpcode::G_SMULH: {
+ computeKnownBitsImpl(MI.getOperand(2).getReg(), Known, DemandedElts,
+ Depth + 1);
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts,
+ Depth + 1);
+ Known = KnownBits::mulhs(Known, Known2);
+ break;
+ }
case TargetOpcode::G_SELECT: {
computeKnownBitsMin(MI.getOperand(2).getReg(), MI.getOperand(3).getReg(),
Known, DemandedElts, Depth + 1);
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index ebfea8e..e17a214 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -2051,6 +2051,12 @@ bool RegisterCoalescer::joinCopy(
}
if (CP.getNewRC()) {
+ if (RegClassInfo.getNumAllocatableRegs(CP.getNewRC()) == 0) {
+      LLVM_DEBUG(dbgs() << "\tNo " << TRI->getRegClassName(CP.getNewRC())
+                        << " are available for allocation\n");
+ return false;
+ }
+
auto SrcRC = MRI->getRegClass(CP.getSrcReg());
auto DstRC = MRI->getRegClass(CP.getDstReg());
unsigned SrcIdx = CP.getSrcIdx();
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
index edc69a3..212a0c0 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -149,7 +149,8 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die,
if (!Name.empty())
WithColor(OS, Color) << Name;
else if (Attr == DW_AT_decl_line || Attr == DW_AT_decl_column ||
- Attr == DW_AT_call_line || Attr == DW_AT_call_column) {
+ Attr == DW_AT_call_line || Attr == DW_AT_call_column ||
+ Attr == DW_AT_language_version) {
if (std::optional<uint64_t> Val = FormValue.getAsUnsignedConstant())
OS << *Val;
else
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index ae086bcd..0bc877d 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -2370,10 +2370,16 @@ static void writeDICompileUnit(raw_ostream &Out, const DICompileUnit *N,
Out << "!DICompileUnit(";
MDFieldPrinter Printer(Out, WriterCtx);
- Printer.printDwarfEnum("language",
- N->getSourceLanguage().getUnversionedName(),
- dwarf::LanguageString,
- /* ShouldSkipZero */ false);
+ auto Lang = N->getSourceLanguage();
+ if (Lang.hasVersionedName())
+ Printer.printDwarfEnum(
+ "sourceLanguageName",
+ static_cast<llvm::dwarf::SourceLanguageName>(Lang.getName()),
+ dwarf::SourceLanguageNameString,
+ /* ShouldSkipZero */ false);
+ else
+ Printer.printDwarfEnum("language", Lang.getName(), dwarf::LanguageString,
+ /* ShouldSkipZero */ false);
Printer.printMetadata("file", N->getRawFile(), /* ShouldSkipNull */ false);
Printer.printString("producer", N->getProducer());
diff --git a/llvm/lib/IR/ConstantFPRange.cpp b/llvm/lib/IR/ConstantFPRange.cpp
index 7509188..fba6942 100644
--- a/llvm/lib/IR/ConstantFPRange.cpp
+++ b/llvm/lib/IR/ConstantFPRange.cpp
@@ -391,3 +391,23 @@ ConstantFPRange ConstantFPRange::unionWith(const ConstantFPRange &CR) const {
return ConstantFPRange(minnum(Lower, CR.Lower), maxnum(Upper, CR.Upper),
MayBeQNaN | CR.MayBeQNaN, MayBeSNaN | CR.MayBeSNaN);
}
+
+ConstantFPRange ConstantFPRange::abs() const {
+ if (isNaNOnly())
+ return *this;
+ // Check if the range is all non-negative or all non-positive.
+ if (Lower.isNegative() == Upper.isNegative()) {
+ if (Lower.isNegative())
+ return negate();
+ return *this;
+ }
+ // The range contains both positive and negative values.
+ APFloat NewLower = APFloat::getZero(getSemantics());
+ APFloat NewUpper = maxnum(-Lower, Upper);
+ return ConstantFPRange(std::move(NewLower), std::move(NewUpper), MayBeQNaN,
+ MayBeSNaN);
+}
+
+ConstantFPRange ConstantFPRange::negate() const {
+ return ConstantFPRange(-Upper, -Lower, MayBeQNaN, MayBeSNaN);
+}
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 639ddcb..ecaeff7 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -350,7 +350,7 @@ def AArch64PostLegalizerLowering
// Post-legalization combines which are primarily optimizations.
def AArch64PostLegalizerCombiner
: GICombiner<"AArch64PostLegalizerCombinerImpl",
- [copy_prop, cast_of_cast_combines,
+ [copy_prop, cast_of_cast_combines, constant_fold_fp_ops,
buildvector_of_truncate, integer_of_truncate,
mutate_anyext_to_zext, combines_for_extload,
combine_indexed_load_store, sext_trunc_sextload,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 31b3d18..7294f3e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16249,7 +16249,9 @@ SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
bool Negated;
uint64_t SplatVal;
- if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
+ // NOTE: SRAD cannot be used to represent sdiv-by-one.
+ if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated) &&
+ SplatVal > 1) {
SDValue Pg = getPredicateForScalableVector(DAG, DL, VT);
SDValue Res =
DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, DL, VT, Pg, Op->getOperand(0),
@@ -30034,7 +30036,9 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
bool Negated;
uint64_t SplatVal;
- if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
+ // NOTE: SRAD cannot be used to represent sdiv-by-one.
+ if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated) &&
+ SplatVal > 1) {
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32);
@@ -30606,6 +30610,43 @@ AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
assert(OpVT.isScalableVector() &&
"Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
+ if (Op->getNumOperands() == 3) {
+ // aarch64_sve_ld3 only supports packed datatypes.
+ EVT PackedVT = getPackedSVEVectorVT(OpVT.getVectorElementCount());
+ Align Alignment = DAG.getReducedAlign(PackedVT, /*UseABI=*/false);
+ SDValue StackPtr =
+ DAG.CreateStackTemporary(PackedVT.getStoreSize() * 3, Alignment);
+
+ // Write out unmodified operands.
+ SmallVector<SDValue, 3> Chains;
+ for (unsigned I = 0; I < 3; ++I) {
+ SDValue Ptr =
+ DAG.getMemBasePlusOffset(StackPtr, PackedVT.getStoreSize() * I, DL);
+ SDValue V = getSVESafeBitCast(PackedVT, Op.getOperand(I), DAG);
+ Chains.push_back(
+ DAG.getStore(DAG.getEntryNode(), DL, V, Ptr, MachinePointerInfo()));
+ }
+
+ Intrinsic::ID IntID = Intrinsic::aarch64_sve_ld3_sret;
+ EVT PredVT = PackedVT.changeVectorElementType(MVT::i1);
+
+ SmallVector<SDValue, 7> Ops;
+ Ops.push_back(DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains));
+ Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
+ Ops.push_back(DAG.getConstant(1, DL, PredVT));
+ Ops.push_back(StackPtr);
+
+ // Read back and deinterleave data.
+ SDVTList VTs = DAG.getVTList(PackedVT, PackedVT, PackedVT, MVT::Other);
+ SDValue LD3 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops);
+
+ SmallVector<SDValue, 3> Results;
+ Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(0), DAG));
+ Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(1), DAG));
+ Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(2), DAG));
+ return DAG.getMergeValues(Results, DL);
+ }
+
// Are multi-register uzp instructions available?
if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
OpVT.getVectorElementType() != MVT::i1) {
@@ -30647,6 +30688,42 @@ SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
assert(OpVT.isScalableVector() &&
"Expected scalable vector in LowerVECTOR_INTERLEAVE.");
+ if (Op->getNumOperands() == 3) {
+ // aarch64_sve_st3 only supports packed datatypes.
+ EVT PackedVT = getPackedSVEVectorVT(OpVT.getVectorElementCount());
+ SmallVector<SDValue, 3> InVecs;
+ for (SDValue V : Op->ops())
+ InVecs.push_back(getSVESafeBitCast(PackedVT, V, DAG));
+
+ Align Alignment = DAG.getReducedAlign(PackedVT, /*UseABI=*/false);
+ SDValue StackPtr =
+ DAG.CreateStackTemporary(PackedVT.getStoreSize() * 3, Alignment);
+
+ Intrinsic::ID IntID = Intrinsic::aarch64_sve_st3;
+ EVT PredVT = PackedVT.changeVectorElementType(MVT::i1);
+
+ SmallVector<SDValue, 7> Ops;
+ Ops.push_back(DAG.getEntryNode());
+ Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
+ Ops.append(InVecs);
+ Ops.push_back(DAG.getConstant(1, DL, PredVT));
+ Ops.push_back(StackPtr);
+
+ // Interleave operands and store.
+ SDValue Chain = DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops);
+
+ // Read back the interleaved data.
+ SmallVector<SDValue, 3> Results;
+ for (unsigned I = 0; I < 3; ++I) {
+ SDValue Ptr =
+ DAG.getMemBasePlusOffset(StackPtr, PackedVT.getStoreSize() * I, DL);
+ SDValue L = DAG.getLoad(PackedVT, DL, Chain, Ptr, MachinePointerInfo());
+ Results.push_back(getSVESafeBitCast(OpVT, L, DAG));
+ }
+
+ return DAG.getMergeValues(Results, DL);
+ }
+
// Are multi-register zip instructions available?
if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
OpVT.getVectorElementType() != MVT::i1) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 24bef82..8e35ba7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -15,6 +15,7 @@
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "SIModeRegisterDefaults.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
@@ -27,6 +28,7 @@
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/KnownBits.h"
@@ -106,6 +108,7 @@ public:
bool FlowChanged = false;
mutable Function *SqrtF32 = nullptr;
mutable Function *LdexpF32 = nullptr;
+ mutable SmallVector<WeakVH> DeadVals;
DenseMap<const PHINode *, bool> BreakPhiNodesCache;
@@ -242,6 +245,8 @@ public:
Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
FastMathFlags FMF) const;
+ bool tryNarrowMathIfNoOverflow(Instruction *I);
+
public:
bool visitFDiv(BinaryOperator &I);
@@ -281,28 +286,21 @@ bool AMDGPUCodeGenPrepareImpl::run() {
BreakPhiNodesCache.clear();
bool MadeChange = false;
- Function::iterator NextBB;
- for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
- BasicBlock *BB = &*FI;
- NextBB = std::next(FI);
-
- BasicBlock::iterator Next;
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
- I = Next) {
- Next = std::next(I);
-
- MadeChange |= visit(*I);
-
- if (Next != E) { // Control flow changed
- BasicBlock *NextInstBB = Next->getParent();
- if (NextInstBB != BB) {
- BB = NextInstBB;
- E = BB->end();
- FE = F.end();
- }
- }
+  // Need to use make_early_inc_range because integer division expansion is
+  // handled by Transforms/Utils, and it can delete instructions such as the
+  // terminator of the BB.
+ for (BasicBlock &BB : reverse(F)) {
+ for (Instruction &I : make_early_inc_range(reverse(BB))) {
+ if (!isInstructionTriviallyDead(&I, TLI))
+ MadeChange |= visit(I);
}
}
+
+ while (!DeadVals.empty()) {
+ if (auto *I = dyn_cast_or_null<Instruction>(DeadVals.pop_back_val()))
+ RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
+ }
+
return MadeChange;
}
@@ -422,7 +420,7 @@ bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
Value *NewVal = insertValues(Builder, Ty, ResultVals);
NewVal->takeName(&I);
I.replaceAllUsesWith(NewVal);
- I.eraseFromParent();
+ DeadVals.push_back(&I);
return true;
}
@@ -496,10 +494,10 @@ bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
FoldedT, FoldedF);
NewSelect->takeName(&BO);
BO.replaceAllUsesWith(NewSelect);
- BO.eraseFromParent();
+ DeadVals.push_back(&BO);
if (CastOp)
- CastOp->eraseFromParent();
- Sel->eraseFromParent();
+ DeadVals.push_back(CastOp);
+ DeadVals.push_back(Sel);
return true;
}
@@ -895,7 +893,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
if (NewVal) {
FDiv.replaceAllUsesWith(NewVal);
NewVal->takeName(&FDiv);
- RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLI);
+ DeadVals.push_back(&FDiv);
}
return true;
@@ -1302,10 +1300,7 @@ it will create `s_and_b32 s0, s0, 0xff`.
We accept this change since the non-byte load assumes the upper bits
within the byte are all 0.
*/
-static bool tryNarrowMathIfNoOverflow(Instruction *I,
- const SITargetLowering *TLI,
- const TargetTransformInfo &TTI,
- const DataLayout &DL) {
+bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *I) {
unsigned Opc = I->getOpcode();
Type *OldType = I->getType();
@@ -1330,6 +1325,7 @@ static bool tryNarrowMathIfNoOverflow(Instruction *I,
NewType = I->getType()->getWithNewBitWidth(NewBit);
// Old cost
+ const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
InstructionCost OldCost =
TTI.getArithmeticInstrCost(Opc, OldType, TTI::TCK_RecipThroughput);
// New cost of new op
@@ -1360,7 +1356,7 @@ static bool tryNarrowMathIfNoOverflow(Instruction *I,
Value *Zext = Builder.CreateZExt(Arith, OldType);
I->replaceAllUsesWith(Zext);
- I->eraseFromParent();
+ DeadVals.push_back(I);
return true;
}
@@ -1370,8 +1366,7 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
if (UseMul24Intrin && replaceMulWithMul24(I))
return true;
- if (tryNarrowMathIfNoOverflow(&I, ST.getTargetLowering(),
- TM.getTargetTransformInfo(F), DL))
+ if (tryNarrowMathIfNoOverflow(&I))
return true;
bool Changed = false;
@@ -1436,7 +1431,7 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
if (NewDiv) {
I.replaceAllUsesWith(NewDiv);
- I.eraseFromParent();
+ DeadVals.push_back(&I);
Changed = true;
}
}
@@ -1492,7 +1487,7 @@ bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
I.replaceAllUsesWith(ValOrig);
- I.eraseFromParent();
+ DeadVals.push_back(&I);
return true;
}
@@ -1534,7 +1529,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
Fract->takeName(&I);
I.replaceAllUsesWith(Fract);
- RecursivelyDeleteTriviallyDeadInstructions(&I, TLI);
+ DeadVals.push_back(&I);
return true;
}
@@ -1822,7 +1817,7 @@ bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
}
I.replaceAllUsesWith(Vec);
- I.eraseFromParent();
+ DeadVals.push_back(&I);
return true;
}
@@ -1903,7 +1898,7 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
auto *Intrin = B.CreateIntrinsic(
I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
I.replaceAllUsesWith(Intrin);
- I.eraseFromParent();
+ DeadVals.push_back(&I);
return true;
}
@@ -2000,16 +1995,10 @@ bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
Value *Fract = applyFractPat(Builder, FractArg);
Fract->takeName(&I);
I.replaceAllUsesWith(Fract);
-
- RecursivelyDeleteTriviallyDeadInstructions(&I, TLI);
+ DeadVals.push_back(&I);
return true;
}
-static bool isOneOrNegOne(const Value *Val) {
- const APFloat *C;
- return match(Val, m_APFloat(C)) && C->getExactLog2Abs() == 0;
-}
-
// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
Type *Ty = Sqrt.getType()->getScalarType();
@@ -2030,18 +2019,6 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
if (ReqdAccuracy < 1.0f)
return false;
- // FIXME: This is an ugly hack for this pass using forward iteration instead
- // of reverse. If it worked like a normal combiner, the rsq would form before
- // we saw a sqrt call.
- auto *FDiv =
- dyn_cast_or_null<FPMathOperator>(Sqrt.getUniqueUndroppableUser());
- if (FDiv && FDiv->getOpcode() == Instruction::FDiv &&
- FDiv->getFPAccuracy() >= 1.0f &&
- canOptimizeWithRsq(FPOp, FDiv->getFastMathFlags(), SqrtFMF) &&
- // TODO: We should also handle the arcp case for the fdiv with non-1 value
- isOneOrNegOne(FDiv->getOperand(0)))
- return false;
-
Value *SrcVal = Sqrt.getOperand(0);
bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);
@@ -2065,7 +2042,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals);
NewSqrt->takeName(&Sqrt);
Sqrt.replaceAllUsesWith(NewSqrt);
- Sqrt.eraseFromParent();
+ DeadVals.push_back(&Sqrt);
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 73b2660..5407566 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -468,6 +468,38 @@ void RegBankLegalizeHelper::lowerUnpackBitShift(MachineInstr &MI) {
MI.eraseFromParent();
}
+void RegBankLegalizeHelper::lowerUnpackMinMax(MachineInstr &MI) {
+ Register Lo, Hi;
+ switch (MI.getOpcode()) {
+ case AMDGPU::G_SMIN:
+ case AMDGPU::G_SMAX: {
+ // For signed operations, use sign extension
+ auto [Val0_Lo, Val0_Hi] = unpackSExt(MI.getOperand(1).getReg());
+ auto [Val1_Lo, Val1_Hi] = unpackSExt(MI.getOperand(2).getReg());
+ Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
+ .getReg(0);
+ Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
+ .getReg(0);
+ break;
+ }
+ case AMDGPU::G_UMIN:
+ case AMDGPU::G_UMAX: {
+ // For unsigned operations, use zero extension
+ auto [Val0_Lo, Val0_Hi] = unpackZExt(MI.getOperand(1).getReg());
+ auto [Val1_Lo, Val1_Hi] = unpackZExt(MI.getOperand(2).getReg());
+ Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Lo, Val1_Lo})
+ .getReg(0);
+ Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0_Hi, Val1_Hi})
+ .getReg(0);
+ break;
+ }
+ default:
+ llvm_unreachable("Unpack min/max lowering not implemented");
+ }
+ B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
+ MI.eraseFromParent();
+}
+
static bool isSignedBFE(MachineInstr &MI) {
if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI))
return (GI->is(Intrinsic::amdgcn_sbfe));
@@ -654,6 +686,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
}
case UnpackBitShift:
return lowerUnpackBitShift(MI);
+ case UnpackMinMax:
+ return lowerUnpackMinMax(MI);
case Ext32To64: {
const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
MachineInstrBuilder Hi;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
index 7affe5a..d937815 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
@@ -123,6 +123,7 @@ private:
void lowerSplitTo32(MachineInstr &MI);
void lowerSplitTo32Select(MachineInstr &MI);
void lowerSplitTo32SExtInReg(MachineInstr &MI);
+ void lowerUnpackMinMax(MachineInstr &MI);
};
} // end namespace AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index f413bbc..7392f4b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -522,6 +522,22 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32, Sgpr32}, S_BFE})
.Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32, Vgpr32}, V_BFE});
+ addRulesForGOpcs({G_SMIN, G_SMAX}, Standard)
+ .Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt, Sgpr32SExt}})
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
+ .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
+ .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackMinMax})
+ .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});
+
+ addRulesForGOpcs({G_UMIN, G_UMAX}, Standard)
+ .Uni(S16, {{Sgpr32Trunc}, {Sgpr32ZExt, Sgpr32ZExt}})
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
+ .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
+ .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, UnpackMinMax})
+ .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});
+
// Note: we only write S1 rules for G_IMPLICIT_DEF, G_CONSTANT, G_FCONSTANT
// and G_FREEZE here, rest is trivially regbankselected earlier
addRulesForGOpcs({G_IMPLICIT_DEF}).Any({{UniS1}, {{Sgpr32Trunc}, {}}});
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index d0c6910..93e0efd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -212,6 +212,7 @@ enum LoweringMethodID {
VccExtToSel,
UniExtToSel,
UnpackBitShift,
+ UnpackMinMax,
S_BFE,
V_BFE,
VgprToVccCopy,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 557d87f..56807a4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5053,16 +5053,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
//
// vdst, srcA, srcB, srcC
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+ bool UseAGPRForm = !Subtarget.hasGFX90AInsts() ||
+ Info->selectAGPRFormMFMA(MinNumRegsRequired);
+
OpdsMapping[0] =
- Info->getMinNumAGPRs() >= MinNumRegsRequired
- ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
- : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+ UseAGPRForm ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
+ : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
OpdsMapping[4] =
- Info->getMinNumAGPRs() >= MinNumRegsRequired
- ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
- : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+ UseAGPRForm ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
+ : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
break;
}
case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
@@ -5115,11 +5117,21 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8: {
+ Register DstReg = MI.getOperand(0).getReg();
+ unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
+ unsigned MinNumRegsRequired = DstSize / 32;
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ bool UseAGPRForm = Info->selectAGPRFormMFMA(MinNumRegsRequired);
+
// vdst, srcA, srcB, srcC, idx
- OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+ OpdsMapping[0] = UseAGPRForm ? getAGPROpMapping(DstReg, MRI, *TRI)
+ : getVGPROpMapping(DstReg, MRI, *TRI);
+
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
- OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+ OpdsMapping[4] =
+ UseAGPRForm ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
+ : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
break;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index ef63acc..71494be 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -905,7 +905,7 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) {
OS << ":\n";
SlotIndex MBBStartSlot = LIS.getSlotIndexes()->getMBBStartIdx(&MBB);
- SlotIndex MBBEndSlot = LIS.getSlotIndexes()->getMBBEndIdx(&MBB);
+ SlotIndex MBBLastSlot = LIS.getSlotIndexes()->getMBBLastIdx(&MBB);
GCNRPTracker::LiveRegSet LiveIn, LiveOut;
GCNRegPressure RPAtMBBEnd;
@@ -931,7 +931,7 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) {
}
} else {
GCNUpwardRPTracker RPT(LIS);
- RPT.reset(MRI, MBBEndSlot);
+ RPT.reset(MRI, MBBLastSlot);
LiveOut = RPT.getLiveRegs();
RPAtMBBEnd = RPT.getPressure();
@@ -966,14 +966,14 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) {
OS << PFX " Live-out:" << llvm::print(LiveOut, MRI);
if (UseDownwardTracker)
- ReportLISMismatchIfAny(LiveOut, getLiveRegs(MBBEndSlot, LIS, MRI));
+ ReportLISMismatchIfAny(LiveOut, getLiveRegs(MBBLastSlot, LIS, MRI));
GCNRPTracker::LiveRegSet LiveThrough;
for (auto [Reg, Mask] : LiveIn) {
LaneBitmask MaskIntersection = Mask & LiveOut.lookup(Reg);
if (MaskIntersection.any()) {
LaneBitmask LTMask = getRegLiveThroughMask(
- MRI, LIS, Reg, MBBStartSlot, MBBEndSlot, MaskIntersection);
+ MRI, LIS, Reg, MBBStartSlot, MBBLastSlot, MaskIntersection);
if (LTMask.any())
LiveThrough[Reg] = LTMask;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index a9c58bb..898d1ff 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -313,8 +313,8 @@ public:
/// reset tracker to the end of the \p MBB.
void reset(const MachineBasicBlock &MBB) {
- reset(MBB.getParent()->getRegInfo(),
- LIS.getSlotIndexes()->getMBBEndIdx(&MBB));
+ SlotIndex MBBLastSlot = LIS.getSlotIndexes()->getMBBLastIdx(&MBB);
+ reset(MBB.getParent()->getRegInfo(), MBBLastSlot);
}
/// reset tracker to the point just after \p MI (in program order).
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 730be69..80e985d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -103,52 +103,52 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::Untyped, V64RegClass);
addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
- addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
+ addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);
addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
- addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
+ addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
- addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
+ addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);
addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
- addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
+ addRegisterClass(MVT::v6f32, &AMDGPU::VReg_192RegClass);
addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
- addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
+ addRegisterClass(MVT::v3f64, &AMDGPU::VReg_192RegClass);
addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
- addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
+ addRegisterClass(MVT::v7f32, &AMDGPU::VReg_224RegClass);
addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
- addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
+ addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
- addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
+ addRegisterClass(MVT::v4f64, &AMDGPU::VReg_256RegClass);
addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
- addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
+ addRegisterClass(MVT::v9f32, &AMDGPU::VReg_288RegClass);
addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
- addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
+ addRegisterClass(MVT::v10f32, &AMDGPU::VReg_320RegClass);
addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
- addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
+ addRegisterClass(MVT::v11f32, &AMDGPU::VReg_352RegClass);
addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
- addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
+ addRegisterClass(MVT::v12f32, &AMDGPU::VReg_384RegClass);
addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
- addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
+ addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
- addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
+ addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass);
addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
- addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
+ addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass);
if (Subtarget->has16BitInsts()) {
if (Subtarget->useRealTrue16Insts()) {
@@ -180,7 +180,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
- addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
+ addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass);
computeRegisterProperties(Subtarget->getRegisterInfo());
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index b7dbb59..2c1a13c 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -1202,6 +1202,12 @@ public:
unsigned getMinNumAGPRs() const { return MinNumAGPRs; }
+ /// Return true if an MFMA that requires at least \p NumRegs should select to
+ /// the AGPR form, instead of the VGPR form.
+ bool selectAGPRFormMFMA(unsigned NumRegs) const {
+ return !MFMAVGPRForm && getMinNumAGPRs() >= NumRegs;
+ }
+
// \returns true if a function has a use of AGPRs via inline asm or
// has a call which may use it.
bool mayUseAGPRs(const Function &F) const;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 7cfd059..6500fce 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -964,14 +964,12 @@ class MAIFrag<SDPatternOperator Op, bit HasAbid = true, bit Scaled = false> : Pa
class CanUseAGPR_MAI<ValueType vt> {
code PredicateCode = [{
return !Subtarget->hasGFX90AInsts() ||
- (!SIMachineFunctionInfo::MFMAVGPRForm &&
- MF->getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >=
- }] # !srl(vt.Size, 5) # ");";
+ MF->getInfo<SIMachineFunctionInfo>()->selectAGPRFormMFMA(
+ }] # !srl(vt.Size, 5) # ");";
code GISelPredicateCode = [{
return !Subtarget->hasGFX90AInsts() ||
- (!SIMachineFunctionInfo::MFMAVGPRForm &&
- MF.getInfo<SIMachineFunctionInfo>()->getMinNumAGPRs() >=
+ MF.getInfo<SIMachineFunctionInfo>()->selectAGPRFormMFMA(
}] # !srl(vt.Size, 5) # ");";
}
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 2a40fb9..83c7def 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -42,7 +42,6 @@
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ComplexDeinterleavingPass.h"
#include "llvm/CodeGen/ISDOpcodes.h"
-#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td
index a0acfcf..85ce944 100644
--- a/llvm/lib/Target/Hexagon/HexagonPatterns.td
+++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td
@@ -699,35 +699,20 @@ def: OpR_RR_pat<C2_cmpgtp, setgt, i1, I64>;
def: OpR_RR_pat<C2_cmpgtup, setugt, i1, I64>;
def: OpR_RR_pat<C2_cmpgtp, RevCmp<setlt>, i1, I64>;
def: OpR_RR_pat<C2_cmpgtup, RevCmp<setult>, i1, I64>;
-def: OpR_RR_pat<A2_vcmpbeq, seteq, i1, V8I8>;
def: OpR_RR_pat<A2_vcmpbeq, seteq, v8i1, V8I8>;
-def: OpR_RR_pat<A4_vcmpbgt, RevCmp<setlt>, i1, V8I8>;
def: OpR_RR_pat<A4_vcmpbgt, RevCmp<setlt>, v8i1, V8I8>;
-def: OpR_RR_pat<A4_vcmpbgt, setgt, i1, V8I8>;
def: OpR_RR_pat<A4_vcmpbgt, setgt, v8i1, V8I8>;
-def: OpR_RR_pat<A2_vcmpbgtu, RevCmp<setult>, i1, V8I8>;
def: OpR_RR_pat<A2_vcmpbgtu, RevCmp<setult>, v8i1, V8I8>;
-def: OpR_RR_pat<A2_vcmpbgtu, setugt, i1, V8I8>;
def: OpR_RR_pat<A2_vcmpbgtu, setugt, v8i1, V8I8>;
-def: OpR_RR_pat<A2_vcmpheq, seteq, i1, V4I16>;
def: OpR_RR_pat<A2_vcmpheq, seteq, v4i1, V4I16>;
-def: OpR_RR_pat<A2_vcmphgt, RevCmp<setlt>, i1, V4I16>;
def: OpR_RR_pat<A2_vcmphgt, RevCmp<setlt>, v4i1, V4I16>;
-def: OpR_RR_pat<A2_vcmphgt, setgt, i1, V4I16>;
def: OpR_RR_pat<A2_vcmphgt, setgt, v4i1, V4I16>;
-def: OpR_RR_pat<A2_vcmphgtu, RevCmp<setult>, i1, V4I16>;
def: OpR_RR_pat<A2_vcmphgtu, RevCmp<setult>, v4i1, V4I16>;
-def: OpR_RR_pat<A2_vcmphgtu, setugt, i1, V4I16>;
def: OpR_RR_pat<A2_vcmphgtu, setugt, v4i1, V4I16>;
-def: OpR_RR_pat<A2_vcmpweq, seteq, i1, V2I32>;
def: OpR_RR_pat<A2_vcmpweq, seteq, v2i1, V2I32>;
-def: OpR_RR_pat<A2_vcmpwgt, RevCmp<setlt>, i1, V2I32>;
def: OpR_RR_pat<A2_vcmpwgt, RevCmp<setlt>, v2i1, V2I32>;
-def: OpR_RR_pat<A2_vcmpwgt, setgt, i1, V2I32>;
def: OpR_RR_pat<A2_vcmpwgt, setgt, v2i1, V2I32>;
-def: OpR_RR_pat<A2_vcmpwgtu, RevCmp<setult>, i1, V2I32>;
def: OpR_RR_pat<A2_vcmpwgtu, RevCmp<setult>, v2i1, V2I32>;
-def: OpR_RR_pat<A2_vcmpwgtu, setugt, i1, V2I32>;
def: OpR_RR_pat<A2_vcmpwgtu, setugt, v2i1, V2I32>;
def: OpR_RR_pat<F2_sfcmpeq, seteq, i1, F32>;
@@ -1213,12 +1198,6 @@ def: OpR_RI_pat<S2_asl_i_r, Shl, i32, I32, u5_0ImmPred>;
def: OpR_RI_pat<S2_asr_i_p, Sra, i64, I64, u6_0ImmPred>;
def: OpR_RI_pat<S2_lsr_i_p, Srl, i64, I64, u6_0ImmPred>;
def: OpR_RI_pat<S2_asl_i_p, Shl, i64, I64, u6_0ImmPred>;
-def: OpR_RI_pat<S2_asr_i_vh, Sra, v4i16, V4I16, u4_0ImmPred>;
-def: OpR_RI_pat<S2_lsr_i_vh, Srl, v4i16, V4I16, u4_0ImmPred>;
-def: OpR_RI_pat<S2_asl_i_vh, Shl, v4i16, V4I16, u4_0ImmPred>;
-def: OpR_RI_pat<S2_asr_i_vh, Sra, v2i32, V2I32, u5_0ImmPred>;
-def: OpR_RI_pat<S2_lsr_i_vh, Srl, v2i32, V2I32, u5_0ImmPred>;
-def: OpR_RI_pat<S2_asl_i_vh, Shl, v2i32, V2I32, u5_0ImmPred>;
def: OpR_RR_pat<S2_asr_r_r, Sra, i32, I32, I32>;
def: OpR_RR_pat<S2_lsr_r_r, Srl, i32, I32, I32>;
diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index ba70c9e..97379d7 100644
--- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -3677,7 +3677,7 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
Out, STI))
return true;
- if (IsLikely) {
+ if (IsLikely && MemOffsetOp.isExpr()) {
TOut.emitRRX(OpCode, DstRegOp.getReg(), ATReg,
MCOperand::createExpr(MemOffsetOp.getExpr()), IDLoc, STI);
TOut.emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, STI);
diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.td b/llvm/lib/Target/Mips/MipsInstrInfo.td
index eff80e5..21d8ded 100644
--- a/llvm/lib/Target/Mips/MipsInstrInfo.td
+++ b/llvm/lib/Target/Mips/MipsInstrInfo.td
@@ -855,6 +855,16 @@ def calltarget : Operand<iPTR> {
def imm64: Operand<i64>;
+def ConstantImmAsmOperandClass : AsmOperandClass {
+ let Name = "ConstantImm";
+ let PredicateMethod = "isConstantImm";
+ let RenderMethod = "addImmOperands";
+}
+
+def ConstantImm64: Operand<i64> {
+ let ParserMatchClass = ConstantImmAsmOperandClass;
+}
+
def simm19_lsl2 : Operand<i32> {
let EncoderMethod = "getSimm19Lsl2Encoding";
let DecoderMethod = "DecodeSimm19Lsl2";
@@ -2947,10 +2957,10 @@ def : MipsInstAlias<"nor\t$rs, $imm", (NORImm GPR32Opnd:$rs, GPR32Opnd:$rs,
let hasDelaySlot = 1, isCTI = 1 in {
def BneImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rt),
- (ins imm64:$imm64, brtarget:$offset),
+ (ins ConstantImm64:$imm64, brtarget:$offset),
"bne\t$rt, $imm64, $offset">;
def BeqImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rt),
- (ins imm64:$imm64, brtarget:$offset),
+ (ins ConstantImm64:$imm64, brtarget:$offset),
"beq\t$rt, $imm64, $offset">;
class CondBranchPseudo<string instr_asm> :
@@ -2978,7 +2988,7 @@ def BGTUL: CondBranchPseudo<"bgtul">, ISA_MIPS2_NOT_32R6_64R6;
let isCTI = 1 in
class CondBranchImmPseudo<string instr_asm> :
- MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, imm64:$imm, brtarget:$offset),
+ MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, ConstantImm64:$imm, brtarget:$offset),
!strconcat(instr_asm, "\t$rs, $imm, $offset")>;
def BEQLImmMacro : CondBranchImmPseudo<"beql">, ISA_MIPS2_NOT_32R6_64R6;
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 40c05e8..5ceb477 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -1520,6 +1520,8 @@ def HasVendorXqcics
: Predicate<"Subtarget->hasVendorXqcics()">,
AssemblerPredicate<(all_of FeatureVendorXqcics),
"'Xqcics' (Qualcomm uC Conditional Select Extension)">;
+def NoVendorXqcics
+ : Predicate<"!Subtarget->hasVendorXqcics()">;
def FeatureVendorXqcicsr
: RISCVExperimentalExtension<0, 4, "Qualcomm uC CSR Extension">;
@@ -1823,6 +1825,11 @@ def TuneConditionalCompressedMoveFusion
def HasConditionalMoveFusion : Predicate<"Subtarget->hasConditionalMoveFusion()">;
def NoConditionalMoveFusion : Predicate<"!Subtarget->hasConditionalMoveFusion()">;
+def TuneHasSingleElementVecFP64
+ : SubtargetFeature<"single-element-vec-fp64", "HasSingleElementVectorFP64", "true",
+ "Certain vector FP64 operations produce a single result "
+ "element per cycle">;
+
def TuneMIPSP8700
: SubtargetFeature<"mips-p8700", "RISCVProcFamily", "MIPSP8700",
"MIPS p8700 processor">;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index f2724c41..5e1d07a 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -1571,35 +1571,42 @@ def : QCIMVCCIPat<SETUGE, QC_MVGEUI, uimm5nonzero>;
}
let Predicates = [HasVendorXqcicli, IsRV32] in {
-def : QCILICCPat<SETEQ, QC_LIEQ>;
-def : QCILICCPat<SETNE, QC_LINE>;
def : QCILICCPat<SETLT, QC_LILT>;
def : QCILICCPat<SETGE, QC_LIGE>;
def : QCILICCPat<SETULT, QC_LILTU>;
def : QCILICCPat<SETUGE, QC_LIGEU>;
-def : QCILICCIPat<SETEQ, QC_LIEQI, simm5>;
-def : QCILICCIPat<SETNE, QC_LINEI, simm5>;
def : QCILICCIPat<SETLT, QC_LILTI, simm5>;
def : QCILICCIPat<SETGE, QC_LIGEI, simm5>;
def : QCILICCIPat<SETULT, QC_LILTUI, uimm5>;
def : QCILICCIPat<SETUGE, QC_LIGEUI, uimm5>;
-def : QCILICCPatInv<SETNE, QC_LIEQ>;
-def : QCILICCPatInv<SETEQ, QC_LINE>;
def : QCILICCPatInv<SETGE, QC_LILT>;
def : QCILICCPatInv<SETLT, QC_LIGE>;
def : QCILICCPatInv<SETUGE, QC_LILTU>;
def : QCILICCPatInv<SETULT, QC_LIGEU>;
-def : QCILICCIPatInv<SETNE, QC_LIEQI, simm5>;
-def : QCILICCIPatInv<SETEQ, QC_LINEI, simm5>;
def : QCILICCIPatInv<SETGE, QC_LILTI, simm5>;
def : QCILICCIPatInv<SETLT, QC_LIGEI, simm5>;
def : QCILICCIPatInv<SETUGE, QC_LILTUI, uimm5>;
def : QCILICCIPatInv<SETULT, QC_LIGEUI, uimm5>;
} // Predicates = [HasVendorXqcicli, IsRV32]
+// Prioritize Xqcics over these patterns.
+let Predicates = [HasVendorXqcicli, NoVendorXqcics, IsRV32] in {
+def : QCILICCPat<SETEQ, QC_LIEQ>;
+def : QCILICCPat<SETNE, QC_LINE>;
+
+def : QCILICCIPat<SETEQ, QC_LIEQI, simm5>;
+def : QCILICCIPat<SETNE, QC_LINEI, simm5>;
+
+def : QCILICCPatInv<SETNE, QC_LIEQ>;
+def : QCILICCPatInv<SETEQ, QC_LINE>;
+
+def : QCILICCIPatInv<SETNE, QC_LIEQI, simm5>;
+def : QCILICCIPatInv<SETEQ, QC_LINEI, simm5>;
+} // Predicates = [HasVendorXqcicli, NoVendorXqcics, IsRV32]
+
let Predicates = [HasVendorXqcics, IsRV32] in {
// (SELECT X, Y, Z) is canonicalised to `(riscv_selectcc x, 0, NE, y, z)`.
// These exist to prioritise over the `Select_GPR_Using_CC_GPR` pattern.
diff --git a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
index 6d86aff..3658817 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
@@ -14,6 +14,10 @@
// otherwise.
def VLDSX0Pred : MCSchedPredicate<CheckRegOperand<3, X0>>;
+// This scheduling predicate is true when subtarget feature TuneHasSingleElementVecFP64
+// is enabled.
+def SingleElementVecFP64SchedPred : FeatureSchedPredicate<TuneHasSingleElementVecFP64>;
+
// Returns true if this is the sext.w pattern, addiw rd, rs1, 0.
def isSEXT_W
: TIIPredicate<"isSEXT_W",
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 17a7948..e86431f 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -338,7 +338,8 @@ def SIFIVE_X390 : RISCVProcessorModel<"sifive-x390",
FeatureStdExtZvl1024b,
FeatureVendorXSiFivecdiscarddlone,
FeatureVendorXSiFivecflushdlone],
- SiFiveIntelligenceTuneFeatures>;
+ !listconcat(SiFiveIntelligenceTuneFeatures,
+ [TuneHasSingleElementVecFP64])>;
defvar SiFiveP400TuneFeatures = [TuneNoDefaultUnroll,
TuneConditionalCompressedMoveFusion,
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
index 3e07eff..f863392a 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -317,7 +317,6 @@ multiclass SiFive7WriteResBase<int VLEN,
ProcResourceKind VL, ProcResourceKind VS,
ProcResourceKind VCQ,
SiFive7FPLatencies fpLatencies,
- bit isFP64Throttled = false,
bit hasFastGather = false> {
// Branching
@@ -832,29 +831,56 @@ multiclass SiFive7WriteResBase<int VLEN,
// 13. Vector Floating-Point Instructions
foreach mx = SchedMxListF in {
foreach sew = SchedSEWSet<mx, isF=1>.val in {
- defvar Cycles = !if(!and(isFP64Throttled, !eq(sew, 64)),
- SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c,
- SiFive7GetCyclesDefault<mx>.c);
- defvar Lat8 = !if(!and(isFP64Throttled, !eq(sew, 64)), Cycles, 8);
- defvar VA = !if(!and(isFP64Throttled, !eq(sew, 64)), VA1, VA1OrVA2);
defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
- let Latency = Lat8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
- defm : LMULSEWWriteResMXSEW<"WriteVFALUV", [VCQ, VA], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFALUF", [VCQ, VA], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFMulV", [VCQ, VA], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFMulF", [VCQ, VA], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [VCQ, VA], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [VCQ, VA], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFRecpV", [VCQ, VA1], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
- }
- defvar Lat4 = !if(!and(isFP64Throttled, !eq(sew, 64)), Cycles, 4);
- let Latency = Lat4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
- defm : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [VCQ, VA], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [VCQ, VA], mx, sew, IsWorstCase>;
- // min max require merge
- defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [VCQ, VA1], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [VCQ, VA1], mx, sew, IsWorstCase>;
+ if !eq(sew, 64) then {
+ defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
+ foreach SchedWriteName = ["WriteVFALUV", "WriteVFALUF", "WriteVFMulV", "WriteVFMulF",
+ "WriteVFMulAddV", "WriteVFMulAddF"] in
+ defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred,
+ // Predicated
+ [VCQ, VA1], !add(SingleElementCycles, 7), [0, 1], [1, !add(1, SingleElementCycles)],
+ // Not Predicated
+ [VCQ, VA1OrVA2], 8, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)],
+ mx, sew, IsWorstCase>;
+ foreach SchedWriteName = ["WriteVFRecpV", "WriteVFCvtIToFV"] in
+ defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred,
+ // Predicated
+ [VCQ, VA1], !add(SingleElementCycles, 7), [0, 1], [1, !add(1, SingleElementCycles)],
+ // Not Predicated
+ [VCQ, VA1], 8, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)],
+ mx, sew, IsWorstCase>;
+ foreach SchedWriteName = ["WriteVFSgnjV", "WriteVFSgnjF"] in
+ defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred,
+ // Predicated
+ [VCQ, VA1], !add(SingleElementCycles, 3), [0, 1], [1, !add(1, SingleElementCycles)],
+ // Not Predicated
+ [VCQ, VA1OrVA2], 4, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)],
+ mx, sew, IsWorstCase>;
+ foreach SchedWriteName = ["WriteVFMinMaxV", "WriteVFMinMaxF"] in
+ defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred,
+ // Predicated
+ [VCQ, VA1], !add(SingleElementCycles, 3), [0, 1], [1, !add(1, SingleElementCycles)],
+ // Not Predicated
+ [VCQ, VA1], 4, [0, 1], [1, !add(1, SiFive7GetCyclesDefault<mx>.c)],
+ mx, sew, IsWorstCase>;
+ } else {
+ let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, SiFive7GetCyclesDefault<mx>.c)] in {
+ defm : LMULSEWWriteResMXSEW<"WriteVFALUV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFALUF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFMulV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFMulF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFRecpV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ }
+ let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, SiFive7GetCyclesDefault<mx>.c)] in {
+ defm : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
+ // min max require merge
+ defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [VCQ, VA1], mx, sew, IsWorstCase>;
+ }
}
}
}
@@ -892,19 +918,28 @@ multiclass SiFive7WriteResBase<int VLEN,
// Widening
foreach mx = SchedMxListW in {
foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in {
- defvar Cycles = !if(!and(isFP64Throttled, !eq(sew, 32)),
- SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c,
- SiFive7GetCyclesDefault<mx>.c);
defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListW>.c;
- let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in
- defm : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ defvar DefaultCycles = SiFive7GetCyclesDefault<mx>.c;
+ if !eq(sew, 32) then {
+ defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
+ defm : LMULSEWWriteResMXSEWVariant<"WriteVFWCvtIToFV", SingleElementVecFP64SchedPred,
+ // Predicated
+ [VCQ, VA1], 8, [0, 1], [1, !add(1, SingleElementCycles)],
+ // Not Predicated
+ [VCQ, VA1], 8, [0, 1], [1, !add(1, DefaultCycles)],
+ mx, sew, IsWorstCase>;
+ } else {
+ let Latency = 8,
+ AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in
+ defm : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ }
}
}
foreach mx = SchedMxListFW in {
foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
- defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
+ defvar DefaultCycles = SiFive7GetCyclesDefault<mx>.c;
defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
- let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
+ let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in {
defm : LMULSEWWriteResMXSEW<"WriteVFWALUV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
defm : LMULSEWWriteResMXSEW<"WriteVFWALUF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
defm : LMULSEWWriteResMXSEW<"WriteVFWMulV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
@@ -912,11 +947,19 @@ multiclass SiFive7WriteResBase<int VLEN,
defm : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
defm : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [VCQ, VA1OrVA2], mx, sew, IsWorstCase>;
}
- defvar CvtCycles = !if(!and(isFP64Throttled, !eq(sew, 32)),
- SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c,
- SiFive7GetCyclesDefault<mx>.c);
- let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, CvtCycles)] in
- defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ if !eq(sew, 32) then {
+ defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
+ defm : LMULSEWWriteResMXSEWVariant<"WriteVFWCvtFToFV", SingleElementVecFP64SchedPred,
+ // Predicated
+ [VCQ, VA1], 8, [0, 1], [1, !add(1, SingleElementCycles)],
+ // Not Predicated
+ [VCQ, VA1], 8, [0, 1], [1, !add(1, DefaultCycles)],
+ mx, sew, IsWorstCase>;
+ } else {
+ let Latency = 8,
+ AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in
+ defm : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ }
}
defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxListFW>.c;
@@ -933,13 +976,23 @@ multiclass SiFive7WriteResBase<int VLEN,
}
foreach mx = SchedMxListFW in {
foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
- defvar Cycles = !if(!and(isFP64Throttled, !eq(sew, 32)),
- SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c,
- SiFive7GetCyclesNarrowing<mx>.c);
defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
- let Latency = 8, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
- defm : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ defvar DefaultCycles = SiFive7GetCyclesNarrowing<mx>.c;
+ if !eq(sew, 32) then {
+ defvar SingleElementCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
+ foreach SchedWriteName = ["WriteVFNCvtIToFV", "WriteVFNCvtFToFV"] in
+ defm : LMULSEWWriteResMXSEWVariant<SchedWriteName, SingleElementVecFP64SchedPred,
+ // Predicated
+ [VCQ, VA1], 8, [0, 1], [1, !add(1, SingleElementCycles)],
+ // Not Predicated
+ [VCQ, VA1], 8, [0, 1], [1, !add(1, DefaultCycles)],
+ mx, sew, IsWorstCase>;
+ } else {
+ let Latency = 8,
+ AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, DefaultCycles)] in {
+ defm : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ defm : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [VCQ, VA1], mx, sew, IsWorstCase>;
+ }
}
}
}
@@ -1499,7 +1552,6 @@ multiclass SiFive7ReadAdvance {
/// eventually be supplied by different SchedMachineModels.
multiclass SiFive7SchedResources<int vlen, bit extraVALU,
SiFive7FPLatencies fpLatencies,
- bit isFP64Throttled,
bit hasFastGather> {
defm SiFive7 : SiFive7ProcResources<extraVALU>;
@@ -1527,8 +1579,7 @@ multiclass SiFive7SchedResources<int vlen, bit extraVALU,
: SiFive7WriteResBase<vlen, SiFive7PipeA, SiFive7PipeB, SiFive7PipeAB,
SiFive7IDiv, SiFive7FDiv, SiFive7VA1,
SiFive7VA1OrVA2, SiFive7VL, SiFive7VS,
- SiFive7VCQ, fpLatencies, isFP64Throttled,
- hasFastGather>;
+ SiFive7VCQ, fpLatencies, hasFastGather>;
//===----------------------------------------------------------------------===//
// Bypass and advance
@@ -1560,7 +1611,6 @@ class SiFive7SchedMachineModel<int vlen> : SchedMachineModel {
bit HasExtraVALU = false;
SiFive7FPLatencies FPLatencies;
- bit IsFP64Throttled = false;
bit HasFastGather = false;
string Name = !subst("Model", "", !subst("SiFive7", "", NAME));
@@ -1587,7 +1637,6 @@ def SiFive7VLEN512Model : SiFive7SchedMachineModel<512> {
def SiFive7VLEN1024X300Model : SiFive7SchedMachineModel<1024> {
let HasExtraVALU = true;
let FPLatencies = SiFive7LowFPLatencies;
- let IsFP64Throttled = true;
let HasFastGather = true;
}
@@ -1596,7 +1645,6 @@ foreach model = [SiFive7VLEN512Model, SiFive7VLEN1024X300Model] in {
let SchedModel = model in
defm model.Name : SiFive7SchedResources<model.VLEN, model.HasExtraVALU,
model.FPLatencies,
- model.IsFP64Throttled,
model.HasFastGather>;
}
diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td
index 01a4308..d11b446 100644
--- a/llvm/lib/Target/RISCV/RISCVScheduleV.td
+++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td
@@ -128,6 +128,22 @@ multiclass LMULWriteResMXVariant<string name, SchedPredicateBase Pred,
IsWorstCase>;
}
+multiclass LMULSEWWriteResMXSEWVariant<string name, SchedPredicateBase Pred,
+ list<ProcResourceKind> predResources,
+ int predLat, list<int> predAcquireCycles,
+ list<int> predReleaseCycles,
+ list<ProcResourceKind> noPredResources,
+ int noPredLat, list<int> noPredAcquireCycles,
+ list<int> noPredReleaseCycles,
+ string mx, int sew, bit IsWorstCase> {
+ defm "" : LMULWriteResVariantImpl<name, name # "_" # mx # "_E" # sew, Pred, predResources,
+ predLat, predAcquireCycles,
+ predReleaseCycles, noPredResources,
+ noPredLat, noPredAcquireCycles,
+ noPredReleaseCycles,
+ IsWorstCase>;
+}
+
// Define multiclasses to define SchedWrite, SchedRead, WriteRes, and
// ReadAdvance for each (name, LMUL) pair and for each LMUL in each of the
// SchedMxList variants above. Each multiclass is responsible for defining
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
index e8c849e..28a1690 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
@@ -46,7 +46,6 @@
#include "SPIRVSubtarget.h"
#include "SPIRVTargetMachine.h"
#include "SPIRVUtils.h"
-#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
diff --git a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp
index 20f03b0..60d39c9 100644
--- a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp
@@ -19,7 +19,6 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
diff --git a/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp
index 278ad7c..e621bcd44 100644
--- a/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp
@@ -14,7 +14,6 @@
#include "SPIRV.h"
#include "SPIRVSubtarget.h"
#include "SPIRVUtils.h"
-#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/Transforms/Utils/Cloning.h"
diff --git a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp
index 1811492..5b149f8 100644
--- a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp
@@ -16,7 +16,6 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9580ade..1cfcb1f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28,7 +28,6 @@
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 3bc46af..6dd43b2 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -547,7 +547,7 @@ unsigned X86TargetLowering::getAddressSpace() const {
static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
- (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
+ TargetTriple.isAndroid();
}
static Constant* SegmentOffset(IRBuilderBase &IRB,
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index cfdfd94..5066a99 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -1033,19 +1033,17 @@ private:
};
} // namespace
-namespace llvm {
template <>
-struct DenseMapInfo<typename CallsiteContextGraph<
+struct llvm::DenseMapInfo<typename CallsiteContextGraph<
ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
: public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
template <>
-struct DenseMapInfo<typename CallsiteContextGraph<
+struct llvm::DenseMapInfo<typename CallsiteContextGraph<
IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
: public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
template <>
-struct DenseMapInfo<IndexCall>
+struct llvm::DenseMapInfo<IndexCall>
: public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
-} // end namespace llvm
namespace {
diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
index ac41fdd..2d5cb82 100644
--- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -372,9 +372,7 @@ struct VTableSlot {
} // end anonymous namespace
-namespace llvm {
-
-template <> struct DenseMapInfo<VTableSlot> {
+template <> struct llvm::DenseMapInfo<VTableSlot> {
static VTableSlot getEmptyKey() {
return {DenseMapInfo<Metadata *>::getEmptyKey(),
DenseMapInfo<uint64_t>::getEmptyKey()};
@@ -393,7 +391,7 @@ template <> struct DenseMapInfo<VTableSlot> {
}
};
-template <> struct DenseMapInfo<VTableSlotSummary> {
+template <> struct llvm::DenseMapInfo<VTableSlotSummary> {
static VTableSlotSummary getEmptyKey() {
return {DenseMapInfo<StringRef>::getEmptyKey(),
DenseMapInfo<uint64_t>::getEmptyKey()};
@@ -412,8 +410,6 @@ template <> struct DenseMapInfo<VTableSlotSummary> {
}
};
-} // end namespace llvm
-
// Returns true if the function must be unreachable based on ValueInfo.
//
// In particular, identifies a function as unreachable in the following
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 9b272c4..3ddf182 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -28,6 +28,10 @@ using namespace PatternMatch;
#define DEBUG_TYPE "instcombine"
+namespace llvm {
+extern cl::opt<bool> ProfcheckDisableMetadataFixes;
+}
+
/// This is the complement of getICmpCode, which turns an opcode and two
/// operands into either a constant true or false, or a brand new ICmp
/// instruction. The sign is passed in to determine which kind of predicate to
@@ -1272,7 +1276,8 @@ Value *InstCombinerImpl::foldEqOfParts(Value *Cmp0, Value *Cmp1, bool IsAnd) {
static Value *foldAndOrOfICmpsWithConstEq(ICmpInst *Cmp0, ICmpInst *Cmp1,
bool IsAnd, bool IsLogical,
InstCombiner::BuilderTy &Builder,
- const SimplifyQuery &Q) {
+ const SimplifyQuery &Q,
+ Instruction &I) {
// Match an equality compare with a non-poison constant as Cmp0.
// Also, give up if the compare can be constant-folded to avoid looping.
CmpPredicate Pred0;
@@ -1306,9 +1311,12 @@ static Value *foldAndOrOfICmpsWithConstEq(ICmpInst *Cmp0, ICmpInst *Cmp1,
return nullptr;
SubstituteCmp = Builder.CreateICmp(Pred1, Y, C);
}
- if (IsLogical)
- return IsAnd ? Builder.CreateLogicalAnd(Cmp0, SubstituteCmp)
- : Builder.CreateLogicalOr(Cmp0, SubstituteCmp);
+ if (IsLogical) {
+ Instruction *MDFrom =
+ ProfcheckDisableMetadataFixes && isa<SelectInst>(I) ? nullptr : &I;
+ return IsAnd ? Builder.CreateLogicalAnd(Cmp0, SubstituteCmp, "", MDFrom)
+ : Builder.CreateLogicalOr(Cmp0, SubstituteCmp, "", MDFrom);
+ }
return Builder.CreateBinOp(IsAnd ? Instruction::And : Instruction::Or, Cmp0,
SubstituteCmp);
}
@@ -3396,13 +3404,13 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
/*IsLogical*/ false, Builder))
return V;
- if (Value *V =
- foldAndOrOfICmpsWithConstEq(LHS, RHS, IsAnd, IsLogical, Builder, Q))
+ if (Value *V = foldAndOrOfICmpsWithConstEq(LHS, RHS, IsAnd, IsLogical,
+ Builder, Q, I))
return V;
// We can convert this case to bitwise and, because both operands are used
// on the LHS, and as such poison from both will propagate.
- if (Value *V = foldAndOrOfICmpsWithConstEq(RHS, LHS, IsAnd,
- /*IsLogical=*/false, Builder, Q)) {
+ if (Value *V = foldAndOrOfICmpsWithConstEq(
+ RHS, LHS, IsAnd, /*IsLogical=*/false, Builder, Q, I)) {
// If RHS is still used, we should drop samesign flag.
if (IsLogical && RHS->hasSameSign() && !RHS->use_empty()) {
RHS->setSameSign(false);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 56194fe..4c9b10a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -2202,6 +2202,11 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) {
return commonCastTransforms(CI);
}
+Instruction *InstCombinerImpl::visitPtrToAddr(PtrToAddrInst &CI) {
+ // FIXME: Implement variants of ptrtoint folds.
+ return commonCastTransforms(CI);
+}
+
/// This input value (which is known to have vector type) is being zero extended
/// or truncated to the specified vector type. Since the zext/trunc is done
/// using an integer type, we have a (bitcast(cast(bitcast))) pattern,
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index e01c145..218aaf9 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -143,6 +143,7 @@ public:
Instruction *visitUIToFP(CastInst &CI);
Instruction *visitSIToFP(CastInst &CI);
Instruction *visitPtrToInt(PtrToIntInst &CI);
+ Instruction *visitPtrToAddr(PtrToAddrInst &CI);
Instruction *visitIntToPtr(IntToPtrInst &CI);
Instruction *visitBitCast(BitCastInst &CI);
Instruction *visitAddrSpaceCast(AddrSpaceCastInst &CI);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
index 5c747bb..9815644 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -1069,27 +1069,22 @@ struct LoweredPHIRecord {
};
} // namespace
-namespace llvm {
- template<>
- struct DenseMapInfo<LoweredPHIRecord> {
- static inline LoweredPHIRecord getEmptyKey() {
- return LoweredPHIRecord(nullptr, 0);
- }
- static inline LoweredPHIRecord getTombstoneKey() {
- return LoweredPHIRecord(nullptr, 1);
- }
- static unsigned getHashValue(const LoweredPHIRecord &Val) {
- return DenseMapInfo<PHINode*>::getHashValue(Val.PN) ^ (Val.Shift>>3) ^
- (Val.Width>>3);
- }
- static bool isEqual(const LoweredPHIRecord &LHS,
- const LoweredPHIRecord &RHS) {
- return LHS.PN == RHS.PN && LHS.Shift == RHS.Shift &&
- LHS.Width == RHS.Width;
- }
- };
-} // namespace llvm
-
+template <> struct llvm::DenseMapInfo<LoweredPHIRecord> {
+ static inline LoweredPHIRecord getEmptyKey() {
+ return LoweredPHIRecord(nullptr, 0);
+ }
+ static inline LoweredPHIRecord getTombstoneKey() {
+ return LoweredPHIRecord(nullptr, 1);
+ }
+ static unsigned getHashValue(const LoweredPHIRecord &Val) {
+ return DenseMapInfo<PHINode *>::getHashValue(Val.PN) ^ (Val.Shift >> 3) ^
+ (Val.Width >> 3);
+ }
+ static bool isEqual(const LoweredPHIRecord &LHS,
+ const LoweredPHIRecord &RHS) {
+ return LHS.PN == RHS.PN && LHS.Shift == RHS.Shift && LHS.Width == RHS.Width;
+ }
+};
/// This is an integer PHI and we know that it has an illegal type: see if it is
/// only used by trunc or trunc(lshr) operations. If so, we split the PHI into
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 3704ad7..860f8f7 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -600,9 +600,7 @@ static ShadowMapping getShadowMapping(const Triple &TargetTriple, int LongSize,
!IsRISCV64 && !IsLoongArch64 &&
!(Mapping.Offset & (Mapping.Offset - 1)) &&
Mapping.Offset != kDynamicShadowSentinel;
- bool IsAndroidWithIfuncSupport =
- IsAndroid && !TargetTriple.isAndroidVersionLT(21);
- Mapping.InGlobal = ClWithIfunc && IsAndroidWithIfuncSupport && IsArmOrThumb;
+ Mapping.InGlobal = ClWithIfunc && IsAndroid && IsArmOrThumb;
return Mapping;
}
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index 3f7003d..e5935f4 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -148,19 +148,16 @@ public:
class DFAJumpThreading {
public:
- DFAJumpThreading(AssumptionCache *AC, DominatorTree *DT, LoopInfo *LI,
+ DFAJumpThreading(AssumptionCache *AC, DomTreeUpdater *DTU, LoopInfo *LI,
TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE)
- : AC(AC), DT(DT), LI(LI), TTI(TTI), ORE(ORE) {}
+ : AC(AC), DTU(DTU), LI(LI), TTI(TTI), ORE(ORE) {}
bool run(Function &F);
bool LoopInfoBroken;
private:
void
- unfoldSelectInstrs(DominatorTree *DT,
- const SmallVector<SelectInstToUnfold, 4> &SelectInsts) {
- // TODO: Have everything use a single lazy DTU
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+ unfoldSelectInstrs(const SmallVector<SelectInstToUnfold, 4> &SelectInsts) {
SmallVector<SelectInstToUnfold, 4> Stack(SelectInsts);
while (!Stack.empty()) {
@@ -168,7 +165,7 @@ private:
std::vector<SelectInstToUnfold> NewSIsToUnfold;
std::vector<BasicBlock *> NewBBs;
- unfold(&DTU, LI, SIToUnfold, &NewSIsToUnfold, &NewBBs);
+ unfold(DTU, LI, SIToUnfold, &NewSIsToUnfold, &NewBBs);
// Put newly discovered select instructions into the work list.
llvm::append_range(Stack, NewSIsToUnfold);
@@ -181,7 +178,7 @@ private:
std::vector<BasicBlock *> *NewBBs);
AssumptionCache *AC;
- DominatorTree *DT;
+ DomTreeUpdater *DTU;
LoopInfo *LI;
TargetTransformInfo *TTI;
OptimizationRemarkEmitter *ORE;
@@ -389,6 +386,22 @@ inline raw_ostream &operator<<(raw_ostream &OS, const PathType &Path) {
return OS;
}
+/// Helper to get the successor corresponding to a particular case value for
+/// a switch statement.
+static BasicBlock *getNextCaseSuccessor(SwitchInst *Switch,
+ const APInt &NextState) {
+ BasicBlock *NextCase = nullptr;
+ for (auto Case : Switch->cases()) {
+ if (Case.getCaseValue()->getValue() == NextState) {
+ NextCase = Case.getCaseSuccessor();
+ break;
+ }
+ }
+ if (!NextCase)
+ NextCase = Switch->getDefaultDest();
+ return NextCase;
+}
+
namespace {
/// ThreadingPath is a path in the control flow of a loop that can be threaded
/// by cloning necessary basic blocks and replacing conditional branches with
@@ -401,6 +414,10 @@ struct ThreadingPath {
ExitVal = V->getValue();
IsExitValSet = true;
}
+ void setExitValue(const APInt &V) {
+ ExitVal = V;
+ IsExitValSet = true;
+ }
bool isExitValueSet() const { return IsExitValSet; }
/// Determinator is the basic block that determines the next state of the DFA.
@@ -583,44 +600,8 @@ struct AllSwitchPaths {
BasicBlock *getSwitchBlock() { return SwitchBlock; }
void run() {
- StateDefMap StateDef = getStateDefMap();
- if (StateDef.empty()) {
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "SwitchNotPredictable",
- Switch)
- << "Switch instruction is not predictable.";
- });
- return;
- }
-
- auto *SwitchPhi = cast<PHINode>(Switch->getOperand(0));
- auto *SwitchPhiDefBB = SwitchPhi->getParent();
- VisitedBlocks VB;
- // Get paths from the determinator BBs to SwitchPhiDefBB
- std::vector<ThreadingPath> PathsToPhiDef =
- getPathsFromStateDefMap(StateDef, SwitchPhi, VB, MaxNumPaths);
- if (SwitchPhiDefBB == SwitchBlock || PathsToPhiDef.empty()) {
- TPaths = std::move(PathsToPhiDef);
- return;
- }
-
- assert(MaxNumPaths >= PathsToPhiDef.size() && !PathsToPhiDef.empty());
- auto PathsLimit = MaxNumPaths / PathsToPhiDef.size();
- // Find and append paths from SwitchPhiDefBB to SwitchBlock.
- PathsType PathsToSwitchBB =
- paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1, PathsLimit);
- if (PathsToSwitchBB.empty())
- return;
-
- std::vector<ThreadingPath> TempList;
- for (const ThreadingPath &Path : PathsToPhiDef) {
- for (const PathType &PathToSw : PathsToSwitchBB) {
- ThreadingPath PathCopy(Path);
- PathCopy.appendExcludingFirst(PathToSw);
- TempList.push_back(PathCopy);
- }
- }
- TPaths = std::move(TempList);
+ findTPaths();
+ unifyTPaths();
}
private:
@@ -812,6 +793,69 @@ private:
return Res;
}
+ // Find all threadable paths.
+ void findTPaths() {
+ StateDefMap StateDef = getStateDefMap();
+ if (StateDef.empty()) {
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "SwitchNotPredictable",
+ Switch)
+ << "Switch instruction is not predictable.";
+ });
+ return;
+ }
+
+ auto *SwitchPhi = cast<PHINode>(Switch->getOperand(0));
+ auto *SwitchPhiDefBB = SwitchPhi->getParent();
+ VisitedBlocks VB;
+ // Get paths from the determinator BBs to SwitchPhiDefBB
+ std::vector<ThreadingPath> PathsToPhiDef =
+ getPathsFromStateDefMap(StateDef, SwitchPhi, VB, MaxNumPaths);
+ if (SwitchPhiDefBB == SwitchBlock || PathsToPhiDef.empty()) {
+ TPaths = std::move(PathsToPhiDef);
+ return;
+ }
+
+ assert(MaxNumPaths >= PathsToPhiDef.size() && !PathsToPhiDef.empty());
+ auto PathsLimit = MaxNumPaths / PathsToPhiDef.size();
+ // Find and append paths from SwitchPhiDefBB to SwitchBlock.
+ PathsType PathsToSwitchBB =
+ paths(SwitchPhiDefBB, SwitchBlock, VB, /* PathDepth = */ 1, PathsLimit);
+ if (PathsToSwitchBB.empty())
+ return;
+
+ std::vector<ThreadingPath> TempList;
+ for (const ThreadingPath &Path : PathsToPhiDef) {
+ for (const PathType &PathToSw : PathsToSwitchBB) {
+ ThreadingPath PathCopy(Path);
+ PathCopy.appendExcludingFirst(PathToSw);
+ TempList.push_back(PathCopy);
+ }
+ }
+ TPaths = std::move(TempList);
+ }
+
+ // Two states are equivalent if they have the same switch destination.
+  // Unify the states in different threading paths if the states are equivalent.
+ void unifyTPaths() {
+ llvm::SmallDenseMap<BasicBlock *, APInt> DestToState;
+ for (ThreadingPath &Path : TPaths) {
+ APInt NextState = Path.getExitValue();
+ BasicBlock *Dest = getNextCaseSuccessor(Switch, NextState);
+ auto StateIt = DestToState.find(Dest);
+ if (StateIt == DestToState.end()) {
+ DestToState.insert({Dest, NextState});
+ continue;
+ }
+
+ if (NextState != StateIt->second) {
+ LLVM_DEBUG(dbgs() << "Next state in " << Path << " is equivalent to "
+ << StateIt->second << "\n");
+ Path.setExitValue(StateIt->second);
+ }
+ }
+ }
+
unsigned NumVisited = 0;
SwitchInst *Switch;
BasicBlock *SwitchBlock;
@@ -822,11 +866,11 @@ private:
};
struct TransformDFA {
- TransformDFA(AllSwitchPaths *SwitchPaths, DominatorTree *DT,
+ TransformDFA(AllSwitchPaths *SwitchPaths, DomTreeUpdater *DTU,
AssumptionCache *AC, TargetTransformInfo *TTI,
OptimizationRemarkEmitter *ORE,
SmallPtrSet<const Value *, 32> EphValues)
- : SwitchPaths(SwitchPaths), DT(DT), AC(AC), TTI(TTI), ORE(ORE),
+ : SwitchPaths(SwitchPaths), DTU(DTU), AC(AC), TTI(TTI), ORE(ORE),
EphValues(EphValues) {}
bool run() {
@@ -1002,19 +1046,16 @@ private:
SmallPtrSet<BasicBlock *, 16> BlocksToClean;
BlocksToClean.insert_range(successors(SwitchBlock));
- {
- DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Lazy);
- for (const ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) {
- createExitPath(NewDefs, TPath, DuplicateMap, BlocksToClean, &DTU);
- NumPaths++;
- }
-
- // After all paths are cloned, now update the last successor of the cloned
- // path so it skips over the switch statement
- for (const ThreadingPath &TPath : SwitchPaths->getThreadingPaths())
- updateLastSuccessor(TPath, DuplicateMap, &DTU);
+ for (const ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) {
+ createExitPath(NewDefs, TPath, DuplicateMap, BlocksToClean, DTU);
+ NumPaths++;
}
+ // After all paths are cloned, now update the last successor of the cloned
+ // path so it skips over the switch statement
+ for (const ThreadingPath &TPath : SwitchPaths->getThreadingPaths())
+ updateLastSuccessor(TPath, DuplicateMap, DTU);
+
// For each instruction that was cloned and used outside, update its uses
updateSSA(NewDefs);
@@ -1118,7 +1159,7 @@ private:
}
// SSAUpdater handles phi placement and renaming uses with the appropriate
// value.
- SSAUpdate.RewriteAllUses(DT);
+ SSAUpdate.RewriteAllUses(&DTU->getDomTree());
}
/// Clones a basic block, and adds it to the CFG.
@@ -1335,28 +1376,13 @@ private:
return It != ClonedBBs.end() ? (*It).BB : nullptr;
}
- /// Helper to get the successor corresponding to a particular case value for
- /// a switch statement.
- BasicBlock *getNextCaseSuccessor(SwitchInst *Switch, const APInt &NextState) {
- BasicBlock *NextCase = nullptr;
- for (auto Case : Switch->cases()) {
- if (Case.getCaseValue()->getValue() == NextState) {
- NextCase = Case.getCaseSuccessor();
- break;
- }
- }
- if (!NextCase)
- NextCase = Switch->getDefaultDest();
- return NextCase;
- }
-
/// Returns true if IncomingBB is a predecessor of BB.
bool isPredecessor(BasicBlock *BB, BasicBlock *IncomingBB) {
return llvm::is_contained(predecessors(BB), IncomingBB);
}
AllSwitchPaths *SwitchPaths;
- DominatorTree *DT;
+ DomTreeUpdater *DTU;
AssumptionCache *AC;
TargetTransformInfo *TTI;
OptimizationRemarkEmitter *ORE;
@@ -1399,7 +1425,7 @@ bool DFAJumpThreading::run(Function &F) {
<< "candidate for jump threading\n");
LLVM_DEBUG(SI->dump());
- unfoldSelectInstrs(DT, Switch.getSelectInsts());
+ unfoldSelectInstrs(Switch.getSelectInsts());
if (!Switch.getSelectInsts().empty())
MadeChanges = true;
@@ -1421,7 +1447,7 @@ bool DFAJumpThreading::run(Function &F) {
}
#ifdef NDEBUG
- LI->verify(*DT);
+ LI->verify(DTU->getDomTree());
#endif
SmallPtrSet<const Value *, 32> EphValues;
@@ -1429,13 +1455,15 @@ bool DFAJumpThreading::run(Function &F) {
CodeMetrics::collectEphemeralValues(&F, AC, EphValues);
for (AllSwitchPaths SwitchPaths : ThreadableLoops) {
- TransformDFA Transform(&SwitchPaths, DT, AC, TTI, ORE, EphValues);
+ TransformDFA Transform(&SwitchPaths, DTU, AC, TTI, ORE, EphValues);
if (Transform.run())
MadeChanges = LoopInfoBroken = true;
}
+ DTU->flush();
+
#ifdef EXPENSIVE_CHECKS
- assert(DT->verify(DominatorTree::VerificationLevel::Full));
+ assert(DTU->getDomTree().verify(DominatorTree::VerificationLevel::Full));
verifyFunction(F, &dbgs());
#endif
@@ -1450,7 +1478,9 @@ PreservedAnalyses DFAJumpThreadingPass::run(Function &F,
LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
OptimizationRemarkEmitter ORE(&F);
- DFAJumpThreading ThreadImpl(&AC, &DT, &LI, &TTI, &ORE);
+
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+ DFAJumpThreading ThreadImpl(&AC, &DTU, &LI, &TTI, &ORE);
if (!ThreadImpl.run(F))
return PreservedAnalyses::all();
diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
index 0f8cc6c..2afa7b7 100644
--- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -108,7 +108,7 @@ struct SimpleValue {
// of instruction handled below (UnaryOperator, etc.).
if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
if (Function *F = CI->getCalledFunction()) {
- switch ((Intrinsic::ID)F->getIntrinsicID()) {
+ switch (F->getIntrinsicID()) {
case Intrinsic::experimental_constrained_fadd:
case Intrinsic::experimental_constrained_fsub:
case Intrinsic::experimental_constrained_fmul:
@@ -154,9 +154,7 @@ struct SimpleValue {
} // end anonymous namespace
-namespace llvm {
-
-template <> struct DenseMapInfo<SimpleValue> {
+template <> struct llvm::DenseMapInfo<SimpleValue> {
static inline SimpleValue getEmptyKey() {
return DenseMapInfo<Instruction *>::getEmptyKey();
}
@@ -169,8 +167,6 @@ template <> struct DenseMapInfo<SimpleValue> {
static bool isEqual(SimpleValue LHS, SimpleValue RHS);
};
-} // end namespace llvm
-
/// Match a 'select' including an optional 'not's of the condition.
static bool matchSelectWithOptionalNotCond(Value *V, Value *&Cond, Value *&A,
Value *&B,
@@ -509,9 +505,7 @@ struct CallValue {
} // end anonymous namespace
-namespace llvm {
-
-template <> struct DenseMapInfo<CallValue> {
+template <> struct llvm::DenseMapInfo<CallValue> {
static inline CallValue getEmptyKey() {
return DenseMapInfo<Instruction *>::getEmptyKey();
}
@@ -524,8 +518,6 @@ template <> struct DenseMapInfo<CallValue> {
static bool isEqual(CallValue LHS, CallValue RHS);
};
-} // end namespace llvm
-
unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) {
Instruction *Inst = Val.Inst;
@@ -580,9 +572,7 @@ struct GEPValue {
} // namespace
-namespace llvm {
-
-template <> struct DenseMapInfo<GEPValue> {
+template <> struct llvm::DenseMapInfo<GEPValue> {
static inline GEPValue getEmptyKey() {
return DenseMapInfo<Instruction *>::getEmptyKey();
}
@@ -595,8 +585,6 @@ template <> struct DenseMapInfo<GEPValue> {
static bool isEqual(const GEPValue &LHS, const GEPValue &RHS);
};
-} // end namespace llvm
-
unsigned DenseMapInfo<GEPValue>::getHashValue(const GEPValue &Val) {
auto *GEP = cast<GetElementPtrInst>(Val.Inst);
if (Val.ConstantOffset.has_value())
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 638952a..3a8ade8 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -170,9 +170,7 @@ struct llvm::GVNPass::Expression {
}
};
-namespace llvm {
-
-template <> struct DenseMapInfo<GVNPass::Expression> {
+template <> struct llvm::DenseMapInfo<GVNPass::Expression> {
static inline GVNPass::Expression getEmptyKey() { return ~0U; }
static inline GVNPass::Expression getTombstoneKey() { return ~1U; }
@@ -188,8 +186,6 @@ template <> struct DenseMapInfo<GVNPass::Expression> {
}
};
-} // end namespace llvm
-
/// Represents a particular available value that we know how to materialize.
/// Materialization of an AvailableValue never fails. An AvailableValue is
/// implicitly associated with a rematerialization point which is the
@@ -2084,13 +2080,6 @@ bool GVNPass::processNonLocalLoad(LoadInst *Load) {
return Changed;
}
-static bool hasUsersIn(Value *V, BasicBlock *BB) {
- return any_of(V->users(), [BB](User *U) {
- auto *I = dyn_cast<Instruction>(U);
- return I && I->getParent() == BB;
- });
-}
-
bool GVNPass::processAssumeIntrinsic(AssumeInst *IntrinsicI) {
Value *V = IntrinsicI->getArgOperand(0);
@@ -2149,85 +2138,7 @@ bool GVNPass::processAssumeIntrinsic(AssumeInst *IntrinsicI) {
}
Constant *True = ConstantInt::getTrue(V->getContext());
- bool Changed = false;
-
- for (BasicBlock *Successor : successors(IntrinsicI->getParent())) {
- BasicBlockEdge Edge(IntrinsicI->getParent(), Successor);
-
- // This property is only true in dominated successors, propagateEquality
- // will check dominance for us.
- Changed |= propagateEquality(V, True, Edge, false);
- }
-
- // We can replace assume value with true, which covers cases like this:
- // call void @llvm.assume(i1 %cmp)
- // br i1 %cmp, label %bb1, label %bb2 ; will change %cmp to true
- ReplaceOperandsWithMap[V] = True;
-
- // Similarly, after assume(!NotV) we know that NotV == false.
- Value *NotV;
- if (match(V, m_Not(m_Value(NotV))))
- ReplaceOperandsWithMap[NotV] = ConstantInt::getFalse(V->getContext());
-
- // If we find an equality fact, canonicalize all dominated uses in this block
- // to one of the two values. We heuristically choice the "oldest" of the
- // two where age is determined by value number. (Note that propagateEquality
- // above handles the cross block case.)
- //
- // Key case to cover are:
- // 1)
- // %cmp = fcmp oeq float 3.000000e+00, %0 ; const on lhs could happen
- // call void @llvm.assume(i1 %cmp)
- // ret float %0 ; will change it to ret float 3.000000e+00
- // 2)
- // %load = load float, float* %addr
- // %cmp = fcmp oeq float %load, %0
- // call void @llvm.assume(i1 %cmp)
- // ret float %load ; will change it to ret float %0
- if (auto *CmpI = dyn_cast<CmpInst>(V)) {
- if (CmpI->isEquivalence()) {
- Value *CmpLHS = CmpI->getOperand(0);
- Value *CmpRHS = CmpI->getOperand(1);
- // Heuristically pick the better replacement -- the choice of heuristic
- // isn't terribly important here, but the fact we canonicalize on some
- // replacement is for exposing other simplifications.
- // TODO: pull this out as a helper function and reuse w/ existing
- // (slightly different) logic.
- if (isa<Constant>(CmpLHS) && !isa<Constant>(CmpRHS))
- std::swap(CmpLHS, CmpRHS);
- if (!isa<Instruction>(CmpLHS) && isa<Instruction>(CmpRHS))
- std::swap(CmpLHS, CmpRHS);
- if ((isa<Argument>(CmpLHS) && isa<Argument>(CmpRHS)) ||
- (isa<Instruction>(CmpLHS) && isa<Instruction>(CmpRHS))) {
- // Move the 'oldest' value to the right-hand side, using the value
- // number as a proxy for age.
- uint32_t LVN = VN.lookupOrAdd(CmpLHS);
- uint32_t RVN = VN.lookupOrAdd(CmpRHS);
- if (LVN < RVN)
- std::swap(CmpLHS, CmpRHS);
- }
-
- // Handle degenerate case where we either haven't pruned a dead path or a
- // removed a trivial assume yet.
- if (isa<Constant>(CmpLHS) && isa<Constant>(CmpRHS))
- return Changed;
-
- LLVM_DEBUG(dbgs() << "Replacing dominated uses of "
- << *CmpLHS << " with "
- << *CmpRHS << " in block "
- << IntrinsicI->getParent()->getName() << "\n");
-
- // Setup the replacement map - this handles uses within the same block.
- if (hasUsersIn(CmpLHS, IntrinsicI->getParent()))
- ReplaceOperandsWithMap[CmpLHS] = CmpRHS;
-
- // NOTE: The non-block local cases are handled by the call to
- // propagateEquality above; this block is just about handling the block
- // local cases. TODO: There's a bunch of logic in propagateEqualiy which
- // isn't duplicated for the block local case, can we share it somehow?
- }
- }
- return Changed;
+ return propagateEquality(V, True, IntrinsicI);
}
static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
@@ -2526,39 +2437,28 @@ void GVNPass::assignBlockRPONumber(Function &F) {
InvalidBlockRPONumbers = false;
}
-bool GVNPass::replaceOperandsForInBlockEquality(Instruction *Instr) const {
- bool Changed = false;
- for (unsigned OpNum = 0; OpNum < Instr->getNumOperands(); ++OpNum) {
- Use &Operand = Instr->getOperandUse(OpNum);
- auto It = ReplaceOperandsWithMap.find(Operand.get());
- if (It != ReplaceOperandsWithMap.end()) {
- const DataLayout &DL = Instr->getDataLayout();
- if (!canReplacePointersInUseIfEqual(Operand, It->second, DL))
- continue;
-
- LLVM_DEBUG(dbgs() << "GVN replacing: " << *Operand << " with "
- << *It->second << " in instruction " << *Instr << '\n');
- Instr->setOperand(OpNum, It->second);
- Changed = true;
- }
- }
- return Changed;
-}
-
-/// The given values are known to be equal in every block
+/// The given values are known to be equal in every use
/// dominated by 'Root'. Exploit this, for example by replacing 'LHS' with
/// 'RHS' everywhere in the scope. Returns whether a change was made.
-/// If DominatesByEdge is false, then it means that we will propagate the RHS
-/// value starting from the end of Root.Start.
-bool GVNPass::propagateEquality(Value *LHS, Value *RHS,
- const BasicBlockEdge &Root,
- bool DominatesByEdge) {
+/// The Root may either be a basic block edge (for conditions) or an
+/// instruction (for assumes).
+bool GVNPass::propagateEquality(
+ Value *LHS, Value *RHS,
+ const std::variant<BasicBlockEdge, Instruction *> &Root) {
SmallVector<std::pair<Value*, Value*>, 4> Worklist;
Worklist.push_back(std::make_pair(LHS, RHS));
bool Changed = false;
- // For speed, compute a conservative fast approximation to
- // DT->dominates(Root, Root.getEnd());
- const bool RootDominatesEnd = isOnlyReachableViaThisEdge(Root, DT);
+ SmallVector<const BasicBlock *> DominatedBlocks;
+ if (const BasicBlockEdge *Edge = std::get_if<BasicBlockEdge>(&Root)) {
+ // For speed, compute a conservative fast approximation to
+ // DT->dominates(Root, Root.getEnd());
+ if (isOnlyReachableViaThisEdge(*Edge, DT))
+ DominatedBlocks.push_back(Edge->getEnd());
+ } else {
+ Instruction *I = std::get<Instruction *>(Root);
+ for (const auto *Node : DT->getNode(I->getParent())->children())
+ DominatedBlocks.push_back(Node->getBlock());
+ }
while (!Worklist.empty()) {
std::pair<Value*, Value*> Item = Worklist.pop_back_val();
@@ -2606,9 +2506,9 @@ bool GVNPass::propagateEquality(Value *LHS, Value *RHS,
// using the leader table is about compiling faster, not optimizing better).
// The leader table only tracks basic blocks, not edges. Only add to if we
// have the simple case where the edge dominates the end.
- if (RootDominatesEnd && !isa<Instruction>(RHS) &&
- canReplacePointersIfEqual(LHS, RHS, DL))
- LeaderTable.insert(LVN, RHS, Root.getEnd());
+ if (!isa<Instruction>(RHS) && canReplacePointersIfEqual(LHS, RHS, DL))
+ for (const BasicBlock *BB : DominatedBlocks)
+ LeaderTable.insert(LVN, RHS, BB);
// Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope. As
// LHS always has at least one use that is not dominated by Root, this will
@@ -2618,12 +2518,14 @@ bool GVNPass::propagateEquality(Value *LHS, Value *RHS,
auto CanReplacePointersCallBack = [&DL](const Use &U, const Value *To) {
return canReplacePointersInUseIfEqual(U, To, DL);
};
- unsigned NumReplacements =
- DominatesByEdge
- ? replaceDominatedUsesWithIf(LHS, RHS, *DT, Root,
- CanReplacePointersCallBack)
- : replaceDominatedUsesWithIf(LHS, RHS, *DT, Root.getStart(),
- CanReplacePointersCallBack);
+ unsigned NumReplacements;
+ if (const BasicBlockEdge *Edge = std::get_if<BasicBlockEdge>(&Root))
+ NumReplacements = replaceDominatedUsesWithIf(
+ LHS, RHS, *DT, *Edge, CanReplacePointersCallBack);
+ else
+ NumReplacements = replaceDominatedUsesWithIf(
+ LHS, RHS, *DT, std::get<Instruction *>(Root),
+ CanReplacePointersCallBack);
if (NumReplacements > 0) {
Changed = true;
@@ -2682,26 +2584,45 @@ bool GVNPass::propagateEquality(Value *LHS, Value *RHS,
// If the number we were assigned was brand new then there is no point in
// looking for an instruction realizing it: there cannot be one!
if (Num < NextNum) {
- Value *NotCmp = findLeader(Root.getEnd(), Num);
- if (NotCmp && isa<Instruction>(NotCmp)) {
- unsigned NumReplacements =
- DominatesByEdge
- ? replaceDominatedUsesWith(NotCmp, NotVal, *DT, Root)
- : replaceDominatedUsesWith(NotCmp, NotVal, *DT,
- Root.getStart());
- Changed |= NumReplacements > 0;
- NumGVNEqProp += NumReplacements;
- // Cached information for anything that uses NotCmp will be invalid.
- if (MD)
- MD->invalidateCachedPointerInfo(NotCmp);
+ for (const auto &Entry : LeaderTable.getLeaders(Num)) {
+ // Only look at leaders that either dominate the start of the edge,
+ // or are dominated by the end. This check is not necessary for
+ // correctness, it only discards cases for which the following
+ // use replacement will not work anyway.
+ if (const BasicBlockEdge *Edge = std::get_if<BasicBlockEdge>(&Root)) {
+ if (!DT->dominates(Entry.BB, Edge->getStart()) &&
+ !DT->dominates(Edge->getEnd(), Entry.BB))
+ continue;
+ } else {
+ auto *InstBB = std::get<Instruction *>(Root)->getParent();
+ if (!DT->dominates(Entry.BB, InstBB) &&
+ !DT->dominates(InstBB, Entry.BB))
+ continue;
+ }
+
+ Value *NotCmp = Entry.Val;
+ if (NotCmp && isa<Instruction>(NotCmp)) {
+ unsigned NumReplacements;
+ if (const BasicBlockEdge *Edge = std::get_if<BasicBlockEdge>(&Root))
+ NumReplacements =
+ replaceDominatedUsesWith(NotCmp, NotVal, *DT, *Edge);
+ else
+ NumReplacements = replaceDominatedUsesWith(
+ NotCmp, NotVal, *DT, std::get<Instruction *>(Root));
+ Changed |= NumReplacements > 0;
+ NumGVNEqProp += NumReplacements;
+ // Cached information for anything that uses NotCmp will be invalid.
+ if (MD)
+ MD->invalidateCachedPointerInfo(NotCmp);
+ }
}
}
// Ensure that any instruction in scope that gets the "A < B" value number
// is replaced with false.
// The leader table only tracks basic blocks, not edges. Only add to if we
// have the simple case where the edge dominates the end.
- if (RootDominatesEnd)
- LeaderTable.insert(Num, NotVal, Root.getEnd());
+ for (const BasicBlock *BB : DominatedBlocks)
+ LeaderTable.insert(Num, NotVal, BB);
continue;
}
@@ -2789,11 +2710,11 @@ bool GVNPass::processInstruction(Instruction *I) {
Value *TrueVal = ConstantInt::getTrue(TrueSucc->getContext());
BasicBlockEdge TrueE(Parent, TrueSucc);
- Changed |= propagateEquality(BranchCond, TrueVal, TrueE, true);
+ Changed |= propagateEquality(BranchCond, TrueVal, TrueE);
Value *FalseVal = ConstantInt::getFalse(FalseSucc->getContext());
BasicBlockEdge FalseE(Parent, FalseSucc);
- Changed |= propagateEquality(BranchCond, FalseVal, FalseE, true);
+ Changed |= propagateEquality(BranchCond, FalseVal, FalseE);
return Changed;
}
@@ -2814,7 +2735,7 @@ bool GVNPass::processInstruction(Instruction *I) {
// If there is only a single edge, propagate the case value into it.
if (SwitchEdges.lookup(Dst) == 1) {
BasicBlockEdge E(Parent, Dst);
- Changed |= propagateEquality(SwitchCond, Case.getCaseValue(), E, true);
+ Changed |= propagateEquality(SwitchCond, Case.getCaseValue(), E);
}
}
return Changed;
@@ -2942,8 +2863,6 @@ bool GVNPass::processBlock(BasicBlock *BB) {
if (DeadBlocks.count(BB))
return false;
- // Clearing map before every BB because it can be used only for single BB.
- ReplaceOperandsWithMap.clear();
bool ChangedFunction = false;
// Since we may not have visited the input blocks of the phis, we can't
@@ -2955,11 +2874,8 @@ bool GVNPass::processBlock(BasicBlock *BB) {
for (PHINode *PN : PHINodesToRemove) {
removeInstruction(PN);
}
- for (Instruction &Inst : make_early_inc_range(*BB)) {
- if (!ReplaceOperandsWithMap.empty())
- ChangedFunction |= replaceOperandsForInBlockEquality(&Inst);
+ for (Instruction &Inst : make_early_inc_range(*BB))
ChangedFunction |= processInstruction(&Inst);
- }
return ChangedFunction;
}
diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp
index 3c1a8ba..80aa98d 100644
--- a/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -434,10 +434,6 @@ private:
int StoreCount = 0;
};
-} // end anonymous namespace
-
-namespace llvm {
-
struct ExactEqualsExpression {
const Expression &E;
@@ -449,8 +445,9 @@ struct ExactEqualsExpression {
return E.exactlyEquals(Other);
}
};
+} // end anonymous namespace
-template <> struct DenseMapInfo<const Expression *> {
+template <> struct llvm::DenseMapInfo<const Expression *> {
static const Expression *getEmptyKey() {
auto Val = static_cast<uintptr_t>(-1);
Val <<= PointerLikeTypeTraits<const Expression *>::NumLowBitsAvailable;
@@ -493,8 +490,6 @@ template <> struct DenseMapInfo<const Expression *> {
}
};
-} // end namespace llvm
-
namespace {
class NewGVN {
diff --git a/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp b/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp
index 2190dcd..a87822c 100644
--- a/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp
+++ b/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp
@@ -84,10 +84,6 @@ public:
bool run();
};
-} // anonymous namespace
-
-namespace llvm {
-
struct FrozenIndPHIInfo {
// A freeze instruction that uses an induction phi
FreezeInst *FI = nullptr;
@@ -103,7 +99,9 @@ struct FrozenIndPHIInfo {
bool operator==(const FrozenIndPHIInfo &Other) { return FI == Other.FI; }
};
-template <> struct DenseMapInfo<FrozenIndPHIInfo> {
+} // namespace
+
+template <> struct llvm::DenseMapInfo<FrozenIndPHIInfo> {
static inline FrozenIndPHIInfo getEmptyKey() {
return FrozenIndPHIInfo(DenseMapInfo<PHINode *>::getEmptyKey(),
DenseMapInfo<BinaryOperator *>::getEmptyKey());
@@ -124,8 +122,6 @@ template <> struct DenseMapInfo<FrozenIndPHIInfo> {
};
};
-} // end namespace llvm
-
// Given U = (value, user), replace value with freeze(value), and let
// SCEV forget user. The inserted freeze is placed in the preheader.
void CanonicalizeFreezeInLoopsImpl::InsertFreezeAndForgetFromSCEV(Use &U) {
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index b6ca52e..46f2903 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -3246,6 +3246,13 @@ unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
return ::replaceDominatedUsesWith(From, To, Dominates);
}
+unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
+ DominatorTree &DT,
+ const Instruction *I) {
+ auto Dominates = [&](const Use &U) { return DT.dominates(I, U); };
+ return ::replaceDominatedUsesWith(From, To, Dominates);
+}
+
unsigned llvm::replaceDominatedUsesWithIf(
Value *From, Value *To, DominatorTree &DT, const BasicBlockEdge &Root,
function_ref<bool(const Use &U, const Value *To)> ShouldReplace) {
@@ -3264,6 +3271,15 @@ unsigned llvm::replaceDominatedUsesWithIf(
return ::replaceDominatedUsesWith(From, To, DominatesAndShouldReplace);
}
+unsigned llvm::replaceDominatedUsesWithIf(
+ Value *From, Value *To, DominatorTree &DT, const Instruction *I,
+ function_ref<bool(const Use &U, const Value *To)> ShouldReplace) {
+ auto DominatesAndShouldReplace = [&](const Use &U) {
+ return DT.dominates(I, U) && ShouldReplace(U, To);
+ };
+ return ::replaceDominatedUsesWith(From, To, DominatesAndShouldReplace);
+}
+
bool llvm::callsGCLeafFunction(const CallBase *Call,
const TargetLibraryInfo &TLI) {
// Check if the function is specifically marked as a gc leaf function.
diff --git a/llvm/lib/Transforms/Utils/LowerInvoke.cpp b/llvm/lib/Transforms/Utils/LowerInvoke.cpp
index ff2ab3c..cecb662 100644
--- a/llvm/lib/Transforms/Utils/LowerInvoke.cpp
+++ b/llvm/lib/Transforms/Utils/LowerInvoke.cpp
@@ -27,15 +27,15 @@ using namespace llvm;
STATISTIC(NumInvokes, "Number of invokes replaced");
namespace {
- class LowerInvokeLegacyPass : public FunctionPass {
- public:
- static char ID; // Pass identification, replacement for typeid
- explicit LowerInvokeLegacyPass() : FunctionPass(ID) {
- initializeLowerInvokeLegacyPassPass(*PassRegistry::getPassRegistry());
- }
- bool runOnFunction(Function &F) override;
- };
-}
+class LowerInvokeLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit LowerInvokeLegacyPass() : FunctionPass(ID) {
+ initializeLowerInvokeLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override;
+};
+} // namespace
char LowerInvokeLegacyPass::ID = 0;
INITIALIZE_PASS(LowerInvokeLegacyPass, "lowerinvoke",
@@ -78,11 +78,12 @@ bool LowerInvokeLegacyPass::runOnFunction(Function &F) {
return runImpl(F);
}
-namespace llvm {
-char &LowerInvokePassID = LowerInvokeLegacyPass::ID;
+char &llvm::LowerInvokePassID = LowerInvokeLegacyPass::ID;
// Public Interface To the LowerInvoke pass.
-FunctionPass *createLowerInvokePass() { return new LowerInvokeLegacyPass(); }
+FunctionPass *llvm::createLowerInvokePass() {
+ return new LowerInvokeLegacyPass();
+}
PreservedAnalyses LowerInvokePass::run(Function &F,
FunctionAnalysisManager &AM) {
@@ -92,4 +93,3 @@ PreservedAnalyses LowerInvokePass::run(Function &F,
return PreservedAnalyses::none();
}
-}
diff --git a/llvm/lib/Transforms/Utils/MisExpect.cpp b/llvm/lib/Transforms/Utils/MisExpect.cpp
index ca7e09d..1585e9e 100644
--- a/llvm/lib/Transforms/Utils/MisExpect.cpp
+++ b/llvm/lib/Transforms/Utils/MisExpect.cpp
@@ -48,8 +48,6 @@
using namespace llvm;
using namespace misexpect;
-namespace llvm {
-
// Command line option to enable/disable the warning when profile data suggests
// a mismatch with the use of the llvm.expect intrinsic
static cl::opt<bool> PGOWarnMisExpect(
@@ -63,22 +61,18 @@ static cl::opt<uint32_t> MisExpectTolerance(
cl::desc("Prevents emitting diagnostics when profile counts are "
"within N% of the threshold.."));
-} // namespace llvm
-
-namespace {
-
-bool isMisExpectDiagEnabled(LLVMContext &Ctx) {
+static bool isMisExpectDiagEnabled(const LLVMContext &Ctx) {
return PGOWarnMisExpect || Ctx.getMisExpectWarningRequested();
}
-uint32_t getMisExpectTolerance(LLVMContext &Ctx) {
+static uint32_t getMisExpectTolerance(const LLVMContext &Ctx) {
return std::max(static_cast<uint32_t>(MisExpectTolerance),
Ctx.getDiagnosticsMisExpectTolerance());
}
-Instruction *getInstCondition(Instruction *I) {
+static const Instruction *getInstCondition(const Instruction *I) {
assert(I != nullptr && "MisExpect target Instruction cannot be nullptr");
- Instruction *Ret = nullptr;
+ const Instruction *Ret = nullptr;
if (auto *B = dyn_cast<BranchInst>(I)) {
Ret = dyn_cast<Instruction>(B->getCondition());
}
@@ -97,8 +91,8 @@ Instruction *getInstCondition(Instruction *I) {
return Ret ? Ret : I;
}
-void emitMisexpectDiagnostic(Instruction *I, LLVMContext &Ctx,
- uint64_t ProfCount, uint64_t TotalCount) {
+static void emitMisexpectDiagnostic(const Instruction *I, LLVMContext &Ctx,
+ uint64_t ProfCount, uint64_t TotalCount) {
double PercentageCorrect = (double)ProfCount / TotalCount;
auto PerString =
formatv("{0:P} ({1} / {2})", PercentageCorrect, ProfCount, TotalCount);
@@ -106,20 +100,16 @@ void emitMisexpectDiagnostic(Instruction *I, LLVMContext &Ctx,
"Potential performance regression from use of the llvm.expect intrinsic: "
"Annotation was correct on {0} of profiled executions.",
PerString);
- Instruction *Cond = getInstCondition(I);
+ const Instruction *Cond = getInstCondition(I);
if (isMisExpectDiagEnabled(Ctx))
Ctx.diagnose(DiagnosticInfoMisExpect(Cond, Twine(PerString)));
OptimizationRemarkEmitter ORE(I->getParent()->getParent());
ORE.emit(OptimizationRemark(DEBUG_TYPE, "misexpect", Cond) << RemStr.str());
}
-} // namespace
-
-namespace llvm {
-namespace misexpect {
-
-void verifyMisExpect(Instruction &I, ArrayRef<uint32_t> RealWeights,
- ArrayRef<uint32_t> ExpectedWeights) {
+void misexpect::verifyMisExpect(const Instruction &I,
+ ArrayRef<uint32_t> RealWeights,
+ ArrayRef<uint32_t> ExpectedWeights) {
// To determine if we emit a diagnostic, we need to compare the branch weights
// from the profile to those added by the llvm.expect intrinsic.
// So first, we extract the "likely" and "unlikely" weights from
@@ -128,15 +118,13 @@ void verifyMisExpect(Instruction &I, ArrayRef<uint32_t> RealWeights,
uint64_t LikelyBranchWeight = 0,
UnlikelyBranchWeight = std::numeric_limits<uint32_t>::max();
size_t MaxIndex = 0;
- for (size_t Idx = 0, End = ExpectedWeights.size(); Idx < End; Idx++) {
- uint32_t V = ExpectedWeights[Idx];
+ for (const auto &[Idx, V] : enumerate(ExpectedWeights)) {
if (LikelyBranchWeight < V) {
LikelyBranchWeight = V;
MaxIndex = Idx;
}
- if (UnlikelyBranchWeight > V) {
+ if (UnlikelyBranchWeight > V)
UnlikelyBranchWeight = V;
- }
}
const uint64_t ProfiledWeight = RealWeights[MaxIndex];
@@ -161,7 +149,7 @@ void verifyMisExpect(Instruction &I, ArrayRef<uint32_t> RealWeights,
uint64_t ScaledThreshold = LikelyProbablilty.scale(RealWeightsTotal);
// clamp tolerance range to [0, 100)
- auto Tolerance = getMisExpectTolerance(I.getContext());
+ uint32_t Tolerance = getMisExpectTolerance(I.getContext());
Tolerance = std::clamp(Tolerance, 0u, 99u);
// Allow users to relax checking by N% i.e., if they use a 5% tolerance,
@@ -175,8 +163,8 @@ void verifyMisExpect(Instruction &I, ArrayRef<uint32_t> RealWeights,
RealWeightsTotal);
}
-void checkBackendInstrumentation(Instruction &I,
- const ArrayRef<uint32_t> RealWeights) {
+void misexpect::checkBackendInstrumentation(const Instruction &I,
+ ArrayRef<uint32_t> RealWeights) {
// Backend checking assumes any existing weight comes from an `llvm.expect`
// intrinsic. However, SampleProfiling + ThinLTO add branch weights multiple
// times, leading to an invalid assumption in our checking. Backend checks
@@ -190,24 +178,19 @@ void checkBackendInstrumentation(Instruction &I,
verifyMisExpect(I, RealWeights, ExpectedWeights);
}
-void checkFrontendInstrumentation(Instruction &I,
- const ArrayRef<uint32_t> ExpectedWeights) {
+void misexpect::checkFrontendInstrumentation(
+ const Instruction &I, ArrayRef<uint32_t> ExpectedWeights) {
SmallVector<uint32_t> RealWeights;
if (!extractBranchWeights(I, RealWeights))
return;
verifyMisExpect(I, RealWeights, ExpectedWeights);
}
-void checkExpectAnnotations(Instruction &I,
- const ArrayRef<uint32_t> ExistingWeights,
- bool IsFrontend) {
- if (IsFrontend) {
+void misexpect::checkExpectAnnotations(const Instruction &I,
+ ArrayRef<uint32_t> ExistingWeights,
+ bool IsFrontend) {
+ if (IsFrontend)
checkFrontendInstrumentation(I, ExistingWeights);
- } else {
+ else
checkBackendInstrumentation(I, ExistingWeights);
- }
}
-
-} // namespace misexpect
-} // namespace llvm
-#undef DEBUG_TYPE
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 155fcc5..d831c27 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -5959,7 +5959,11 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
unsigned PreviousEdges = OtherCases->size();
if (OtherDest == SI->getDefaultDest())
++PreviousEdges;
- for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
+ unsigned E = PreviousEdges - 1;
+ // Remove all incoming values from OtherDest if OtherDest is unreachable.
+ if (NewBI->isUnconditional())
+ ++E;
+ for (unsigned I = 0; I != E; ++I)
cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
}
@@ -7736,8 +7740,7 @@ struct SwitchSuccWrapper {
DenseMap<PHINode *, SmallDenseMap<BasicBlock *, Value *, 8>> *PhiPredIVs;
};
-namespace llvm {
-template <> struct DenseMapInfo<const SwitchSuccWrapper *> {
+template <> struct llvm::DenseMapInfo<const SwitchSuccWrapper *> {
static const SwitchSuccWrapper *getEmptyKey() {
return static_cast<SwitchSuccWrapper *>(
DenseMapInfo<void *>::getEmptyKey());
@@ -7805,7 +7808,6 @@ template <> struct DenseMapInfo<const SwitchSuccWrapper *> {
return true;
}
};
-} // namespace llvm
bool SimplifyCFGOpt::simplifyDuplicateSwitchArms(SwitchInst *SI,
DomTreeUpdater *DTU) {
diff --git a/llvm/test/Analysis/DependenceAnalysis/PreliminaryNoValidityCheckFixedSize.ll b/llvm/test/Analysis/DependenceAnalysis/PreliminaryNoValidityCheckFixedSize.ll
index e67cae7d..8cd29e6 100644
--- a/llvm/test/Analysis/DependenceAnalysis/PreliminaryNoValidityCheckFixedSize.ll
+++ b/llvm/test/Analysis/DependenceAnalysis/PreliminaryNoValidityCheckFixedSize.ll
@@ -20,7 +20,7 @@ define void @p2(i64 %n, ptr %A, ptr %B) nounwind uwtable ssp {
; CHECK-NEXT: Src: store i64 %i.011, ptr %arrayidx8, align 8 --> Dst: store i64 %i.011, ptr %arrayidx8, align 8
; CHECK-NEXT: da analyze - none!
; CHECK-NEXT: Src: store i64 %i.011, ptr %arrayidx8, align 8 --> Dst: %0 = load i64, ptr %arrayidx17, align 8
-; CHECK-NEXT: da analyze - flow [-3 -2]!
+; CHECK-NEXT: da analyze - flow [-3 -2] / assuming 1 loop level(s) fused: [-3 -2 -1]!
; CHECK-NEXT: Src: store i64 %i.011, ptr %arrayidx8, align 8 --> Dst: store i64 %0, ptr %B.addr.24, align 8
; CHECK-NEXT: da analyze - confused!
; CHECK-NEXT: Src: %0 = load i64, ptr %arrayidx17, align 8 --> Dst: %0 = load i64, ptr %arrayidx17, align 8
diff --git a/llvm/test/Analysis/DependenceAnalysis/SameSDLoops.ll b/llvm/test/Analysis/DependenceAnalysis/SameSDLoops.ll
index f80e2cf..57962e0 100644
--- a/llvm/test/Analysis/DependenceAnalysis/SameSDLoops.ll
+++ b/llvm/test/Analysis/DependenceAnalysis/SameSDLoops.ll
@@ -18,7 +18,7 @@ define void @samebd0(ptr %A) nounwind uwtable ssp {
; CHECK-NEXT: Src: store i64 %i.013, ptr %arrayidx12, align 8 --> Dst: store i64 %i.013, ptr %arrayidx12, align 8
; CHECK-NEXT: da analyze - none!
; CHECK-NEXT: Src: store i64 %i.013, ptr %arrayidx12, align 8 --> Dst: store i64 %l17.04, ptr %arrayidx24, align 8
-; CHECK-NEXT: da analyze - output [-4 -3]! / assuming 2 loop level(s) fused: [-4 -3 -3 -1]!
+; CHECK-NEXT: da analyze - output [-4 -3] / assuming 2 loop level(s) fused: [-4 -3 -3 -1]!
; CHECK-NEXT: Src: store i64 %l17.04, ptr %arrayidx24, align 8 --> Dst: store i64 %l17.04, ptr %arrayidx24, align 8
; CHECK-NEXT: da analyze - none!
;
@@ -96,7 +96,7 @@ define void @samebd1(ptr %A) nounwind uwtable ssp {
; CHECK-NEXT: Src: store i64 %i.03, ptr %arrayidx, align 4 --> Dst: store i64 %i.03, ptr %arrayidx, align 4
; CHECK-NEXT: da analyze - none!
; CHECK-NEXT: Src: store i64 %i.03, ptr %arrayidx, align 4 --> Dst: %0 = load i64, ptr %arrayidx7, align 4
-; CHECK-NEXT: da analyze - flow [|<]! / assuming 1 loop level(s) fused: [<=|<]!
+; CHECK-NEXT: da analyze - flow [|<] / assuming 1 loop level(s) fused: [<=|<]!
; CHECK-NEXT: Src: %0 = load i64, ptr %arrayidx7, align 4 --> Dst: %0 = load i64, ptr %arrayidx7, align 4
; CHECK-NEXT: da analyze - none!
;
@@ -148,7 +148,7 @@ define void @non_samebd0(ptr %A) nounwind uwtable ssp {
; CHECK-NEXT: Src: store i64 %i.013, ptr %arrayidx12, align 8 --> Dst: store i64 %i.013, ptr %arrayidx12, align 8
; CHECK-NEXT: da analyze - none!
; CHECK-NEXT: Src: store i64 %i.013, ptr %arrayidx12, align 8 --> Dst: store i64 %l17.04, ptr %arrayidx24, align 8
-; CHECK-NEXT: da analyze - output [-4 -3]!{{$}}
+; CHECK-NEXT: da analyze - output [-4 -3]!
; CHECK-NEXT: Src: store i64 %l17.04, ptr %arrayidx24, align 8 --> Dst: store i64 %l17.04, ptr %arrayidx24, align 8
; CHECK-NEXT: da analyze - none!
;
@@ -227,7 +227,7 @@ define void @non_samebd1(ptr %A) nounwind uwtable ssp {
; CHECK-NEXT: Src: store i64 %i.03, ptr %arrayidx, align 4 --> Dst: store i64 %i.03, ptr %arrayidx, align 4
; CHECK-NEXT: da analyze - none!
; CHECK-NEXT: Src: store i64 %i.03, ptr %arrayidx, align 4 --> Dst: %0 = load i64, ptr %arrayidx7, align 4
-; CHECK-NEXT: da analyze - flow [|<]!{{$}}
+; CHECK-NEXT: da analyze - flow [|<]!
; CHECK-NEXT: Src: %0 = load i64, ptr %arrayidx7, align 4 --> Dst: %0 = load i64, ptr %arrayidx7, align 4
; CHECK-NEXT: da analyze - none!
;
diff --git a/llvm/test/Analysis/IR2Vec/unreachable.ll b/llvm/test/Analysis/IR2Vec/unreachable.ll
index 9be0ee1..627e2c9 100644
--- a/llvm/test/Analysis/IR2Vec/unreachable.ll
+++ b/llvm/test/Analysis/IR2Vec/unreachable.ll
@@ -30,13 +30,17 @@ return: ; preds = %if.else, %if.then
%4 = load i32, ptr %retval, align 4
ret i32 %4
}
-
-; CHECK: Basic block vectors:
+; We'll get individual basic block embeddings for all blocks in the function.
+; But unreachable blocks are not counted for computing the function embedding.
+; CHECK: Function vector: [ 1301.20 1318.20 1335.20 ]
+; CHECK-NEXT: Basic block vectors:
; CHECK-NEXT: Basic block: entry:
; CHECK-NEXT: [ 816.20 825.20 834.20 ]
; CHECK-NEXT: Basic block: if.then:
; CHECK-NEXT: [ 195.00 198.00 201.00 ]
; CHECK-NEXT: Basic block: if.else:
; CHECK-NEXT: [ 195.00 198.00 201.00 ]
+; CHECK-NEXT: Basic block: unreachable:
+; CHECK-NEXT: [ 101.00 103.00 105.00 ]
; CHECK-NEXT: Basic block: return:
; CHECK-NEXT: [ 95.00 97.00 99.00 ]
diff --git a/llvm/test/Assembler/dicompileunit-conflicting-language-fields.ll b/llvm/test/Assembler/dicompileunit-conflicting-language-fields.ll
new file mode 100644
index 0000000..3aad27b
--- /dev/null
+++ b/llvm/test/Assembler/dicompileunit-conflicting-language-fields.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
+
+; CHECK: <stdin>:[[@LINE+1]]:15: error: can only specify one of 'language' and 'sourceLanguageName' on !DICompileUnit
+!0 = distinct !DICompileUnit(language: DW_LANG_C, sourceLanguageName: DW_LNAME_C, file: !DIFile(filename: "a", directory: "b"))
diff --git a/llvm/test/Assembler/dicompileunit-invalid-language.ll b/llvm/test/Assembler/dicompileunit-invalid-language.ll
new file mode 100644
index 0000000..da93c4f
--- /dev/null
+++ b/llvm/test/Assembler/dicompileunit-invalid-language.ll
@@ -0,0 +1,22 @@
+; RUN: split-file %s %t
+; RUN: not llvm-as < %t/invalid_dw_lang.ll -disable-output 2>&1 | FileCheck %s --check-prefix=INVALID_DW_LANG
+; RUN: not llvm-as < %t/invalid_dw_lang_2.ll -disable-output 2>&1 | FileCheck %s --check-prefix=INVALID_DW_LANG_2
+; RUN: not llvm-as < %t/invalid_dw_lname.ll -disable-output 2>&1 | FileCheck %s --check-prefix=INVALID_DW_LNAME
+; RUN: not llvm-as < %t/invalid_dw_lname_2.ll -disable-output 2>&1 | FileCheck %s --check-prefix=INVALID_DW_LNAME_2
+
+; INVALID_DW_LANG: invalid DWARF language 'DW_LANG_blah'
+; INVALID_DW_LANG_2: expected DWARF language
+; INVALID_DW_LNAME: invalid DWARF source language name 'DW_LNAME_blah'
+; INVALID_DW_LNAME_2: expected DWARF source language name
+
+;--- invalid_dw_lang.ll
+!0 = distinct !DICompileUnit(language: DW_LANG_blah)
+
+;--- invalid_dw_lang_2.ll
+!0 = distinct !DICompileUnit(language: DW_LNAME_C)
+
+;--- invalid_dw_lname.ll
+!0 = distinct !DICompileUnit(sourceLanguageName: DW_LNAME_blah)
+
+;--- invalid_dw_lname_2.ll
+!0 = distinct !DICompileUnit(sourceLanguageName: DW_LANG_C)
diff --git a/llvm/test/Assembler/invalid-dicompileunit-missing-language.ll b/llvm/test/Assembler/invalid-dicompileunit-missing-language.ll
index 8e4cb02..ebc86e3 100644
--- a/llvm/test/Assembler/invalid-dicompileunit-missing-language.ll
+++ b/llvm/test/Assembler/invalid-dicompileunit-missing-language.ll
@@ -1,4 +1,4 @@
; RUN: not llvm-as < %s -disable-output 2>&1 | FileCheck %s
-; CHECK: <stdin>:[[@LINE+1]]:74: error: missing required field 'language'
+; CHECK: <stdin>:[[@LINE+1]]:15: error: missing one of 'language' or 'sourceLanguageName', required for !DICompileUnit
!0 = distinct !DICompileUnit(file: !DIFile(filename: "a", directory: "b"))
diff --git a/llvm/test/Bitcode/dwarf-source-language-name.ll b/llvm/test/Bitcode/dwarf-source-language-name.ll
new file mode 100644
index 0000000..e989363
--- /dev/null
+++ b/llvm/test/Bitcode/dwarf-source-language-name.ll
@@ -0,0 +1,15 @@
+; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
+
+; CHECK: sourceLanguageName: DW_LNAME_ObjC_plus_plus
+
+source_filename = "cu.cpp"
+target triple = "arm64-apple-macosx"
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(sourceLanguageName: DW_LNAME_ObjC_plus_plus, file: !1, producer: "handwritten", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, globals: !2, splitDebugInlining: false, nameTableKind: Apple, sysroot: "/")
+!1 = !DIFile(filename: "cu.cpp", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 7, !"Dwarf Version", i32 5}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-smulh.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-smulh.mir
new file mode 100644
index 0000000..b9cde95
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-smulh.mir
@@ -0,0 +1,137 @@
+# NOTE: Assertions have been autogenerated by utils/update_givaluetracking_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=aarch64 -passes='print<gisel-value-tracking>' -filetype=null %s 2>&1 | FileCheck %s
+
+---
+name: Cst
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: @Cst
+ ; CHECK-NEXT: %0:_ KnownBits:00010011 SignBits:3
+ ; CHECK-NEXT: %1:_ KnownBits:11100000 SignBits:3
+ ; CHECK-NEXT: %2:_ KnownBits:11111101 SignBits:6
+ %0:_(s8) = G_CONSTANT i8 19
+ %1:_(s8) = G_CONSTANT i8 224
+ %2:_(s8) = G_SMULH %0, %1
+...
+---
+name: CstZero
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: @CstZero
+ ; CHECK-NEXT: %0:_ KnownBits:11111111 SignBits:8
+ ; CHECK-NEXT: %1:_ KnownBits:00000000 SignBits:8
+ ; CHECK-NEXT: %2:_ KnownBits:00000000 SignBits:8
+ %0:_(s8) = G_CONSTANT i8 255
+ %1:_(s8) = G_CONSTANT i8 0
+ %2:_(s8) = G_SMULH %0, %1
+...
+---
+name: ScalarVar
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: @ScalarVar
+ ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1
+ %0:_(s8) = COPY $b0
+ %1:_(s8) = COPY $b1
+ %2:_(s8) = G_SMULH %0, %1
+...
+---
+name: ScalarZero
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: @ScalarZero
+ ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:00000000 SignBits:8
+ ; CHECK-NEXT: %2:_ KnownBits:00000000 SignBits:8
+ %0:_(s8) = COPY $b0
+ %1:_(s8) = G_CONSTANT i8 0
+ %2:_(s8) = G_SMULH %0, %1
+...
+---
+name: ScalarVarAbs
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: @ScalarVarAbs
+ ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %2:_ KnownBits:???????????????? SignBits:9
+ ; CHECK-NEXT: %3:_ KnownBits:0000000000000001 SignBits:15
+ ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:1
+ %0:_(s8) = COPY $b0
+ %1:_(s8) = G_ABS %0
+ %2:_(s16) = G_SEXT %1
+ %3:_(s16) = G_CONSTANT i16 1
+ %4:_(s16) = G_SMULH %2, %3
+...
+---
+name: SplatVecCst
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: @SplatVecCst
+ ; CHECK-NEXT: %0:_ KnownBits:11111010 SignBits:5
+ ; CHECK-NEXT: %1:_ KnownBits:11111010 SignBits:5
+ ; CHECK-NEXT: %2:_ KnownBits:11111010 SignBits:5
+ ; CHECK-NEXT: %3:_ KnownBits:00000000 SignBits:8
+ %0:_(s8) = G_CONSTANT i8 250
+ %1:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR %0(s8)
+ %2:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR %0(s8)
+ %3:_(<vscale x 16 x s8>) = G_SMULH %1, %2
+...
+---
+name: SplatVecPartScalar
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: @SplatVecPartScalar
+ ; CHECK-NEXT: %0:_ KnownBits:11111010 SignBits:5
+ ; CHECK-NEXT: %1:_ KnownBits:11111010 SignBits:5
+ ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %3:_ KnownBits:00001111 SignBits:4
+ ; CHECK-NEXT: %4:_ KnownBits:0000???? SignBits:4
+ ; CHECK-NEXT: %5:_ KnownBits:0000???? SignBits:4
+ ; CHECK-NEXT: %6:_ KnownBits:???????? SignBits:1
+ %0:_(s8) = G_CONSTANT i8 250
+ %1:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR %0(s8)
+ %2:_(s8) = G_IMPLICIT_DEF
+ %3:_(s8) = G_CONSTANT i8 15
+ %4:_(s8) = G_AND %2, %3
+ %5:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR %4(s8)
+ %6:_(<vscale x 16 x s8>) = G_SMULH %1, %5
+...
+---
+name: VecCst
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: @VecCst
+ ; CHECK-NEXT: %0:_ KnownBits:00011001 SignBits:3
+ ; CHECK-NEXT: %1:_ KnownBits:11100001 SignBits:3
+ ; CHECK-NEXT: %2:_ KnownBits:?????001 SignBits:3
+ ; CHECK-NEXT: %3:_ KnownBits:?????001 SignBits:3
+ ; CHECK-NEXT: %4:_ KnownBits:???????? SignBits:1
+ %0:_(s8) = G_CONSTANT i8 25
+ %1:_(s8) = G_CONSTANT i8 225
+ %2:_(<2 x s8>) = G_BUILD_VECTOR %0:_(s8), %1:_(s8)
+ %3:_(<2 x s8>) = G_BUILD_VECTOR %0:_(s8), %1:_(s8)
+ %4:_(<2 x s8>) = G_SMULH %2, %3
+...
+---
+name: VecPartScalar
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: @VecPartScalar
+ ; CHECK-NEXT: %0:_ KnownBits:11111010 SignBits:5
+ ; CHECK-NEXT: %1:_ KnownBits:11111010 SignBits:5
+ ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %3:_ KnownBits:00001111 SignBits:4
+ ; CHECK-NEXT: %4:_ KnownBits:0000???? SignBits:4
+ ; CHECK-NEXT: %5:_ KnownBits:0000???? SignBits:4
+ ; CHECK-NEXT: %6:_ KnownBits:???????? SignBits:1
+ %0:_(s8) = G_CONSTANT i8 250
+ %1:_(<2 x s8>) = G_BUILD_VECTOR %0:_(s8), %0:_(s8)
+ %2:_(s8) = G_IMPLICIT_DEF
+ %3:_(s8) = G_CONSTANT i8 15
+ %4:_(s8) = G_AND %2, %3
+ %5:_(<2 x s8>) = G_BUILD_VECTOR %4:_(s8), %4:_(s8)
+ %6:_(<2 x s8>) = G_SMULH %1, %5
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-umulh.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-umulh.mir
new file mode 100644
index 0000000..debdbaa
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-umulh.mir
@@ -0,0 +1,137 @@
+# NOTE: Assertions have been autogenerated by utils/update_givaluetracking_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=aarch64 -passes='print<gisel-value-tracking>' -filetype=null %s 2>&1 | FileCheck %s
+
+---
+name: Cst
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: @Cst
+ ; CHECK-NEXT: %0:_ KnownBits:00010011 SignBits:3
+ ; CHECK-NEXT: %1:_ KnownBits:11100000 SignBits:3
+ ; CHECK-NEXT: %2:_ KnownBits:00010000 SignBits:3
+ %0:_(s8) = G_CONSTANT i8 19
+ %1:_(s8) = G_CONSTANT i8 224
+ %2:_(s8) = G_UMULH %0, %1
+...
+---
+name: CstZero
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: @CstZero
+ ; CHECK-NEXT: %0:_ KnownBits:11111111 SignBits:8
+ ; CHECK-NEXT: %1:_ KnownBits:00000000 SignBits:8
+ ; CHECK-NEXT: %2:_ KnownBits:00000000 SignBits:8
+ %0:_(s8) = G_CONSTANT i8 255
+ %1:_(s8) = G_CONSTANT i8 0
+ %2:_(s8) = G_UMULH %0, %1
+...
+---
+name: ScalarVar
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: @ScalarVar
+ ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1
+ %0:_(s8) = COPY $b0
+ %1:_(s8) = COPY $b1
+ %2:_(s8) = G_UMULH %0, %1
+...
+---
+name: ScalarZero
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: @ScalarZero
+ ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:00000000 SignBits:8
+ ; CHECK-NEXT: %2:_ KnownBits:00000000 SignBits:8
+ %0:_(s8) = COPY $b0
+ %1:_(s8) = G_CONSTANT i8 0
+ %2:_(s8) = G_UMULH %0, %1
+...
+---
+name: ScalarVarAbs
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: @ScalarVarAbs
+ ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %2:_ KnownBits:???????????????? SignBits:9
+ ; CHECK-NEXT: %3:_ KnownBits:0000000000000001 SignBits:15
+ ; CHECK-NEXT: %4:_ KnownBits:0000000000000000 SignBits:16
+ %0:_(s8) = COPY $b0
+ %1:_(s8) = G_ABS %0
+ %2:_(s16) = G_SEXT %1
+ %3:_(s16) = G_CONSTANT i16 1
+ %4:_(s16) = G_UMULH %2, %3
+...
+---
+name: SplatVecCst
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: @SplatVecCst
+ ; CHECK-NEXT: %0:_ KnownBits:11111010 SignBits:5
+ ; CHECK-NEXT: %1:_ KnownBits:11111010 SignBits:5
+ ; CHECK-NEXT: %2:_ KnownBits:11111010 SignBits:5
+ ; CHECK-NEXT: %3:_ KnownBits:11110100 SignBits:4
+ %0:_(s8) = G_CONSTANT i8 250
+ %1:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR %0(s8)
+ %2:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR %0(s8)
+ %3:_(<vscale x 16 x s8>) = G_UMULH %1, %2
+...
+---
+name: SplatVecPartScalar
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: @SplatVecPartScalar
+ ; CHECK-NEXT: %0:_ KnownBits:11111010 SignBits:5
+ ; CHECK-NEXT: %1:_ KnownBits:11111010 SignBits:5
+ ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %3:_ KnownBits:00001111 SignBits:4
+ ; CHECK-NEXT: %4:_ KnownBits:0000???? SignBits:4
+ ; CHECK-NEXT: %5:_ KnownBits:0000???? SignBits:4
+ ; CHECK-NEXT: %6:_ KnownBits:0000???? SignBits:4
+ %0:_(s8) = G_CONSTANT i8 250
+ %1:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR %0(s8)
+ %2:_(s8) = G_IMPLICIT_DEF
+ %3:_(s8) = G_CONSTANT i8 15
+ %4:_(s8) = G_AND %2, %3
+ %5:_(<vscale x 16 x s8>) = G_SPLAT_VECTOR %4(s8)
+ %6:_(<vscale x 16 x s8>) = G_UMULH %1, %5
+...
+---
+name: VecCst
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: @VecCst
+ ; CHECK-NEXT: %0:_ KnownBits:00011001 SignBits:3
+ ; CHECK-NEXT: %1:_ KnownBits:11100001 SignBits:3
+ ; CHECK-NEXT: %2:_ KnownBits:?????001 SignBits:3
+ ; CHECK-NEXT: %3:_ KnownBits:?????001 SignBits:3
+ ; CHECK-NEXT: %4:_ KnownBits:???????? SignBits:1
+ %0:_(s8) = G_CONSTANT i8 25
+ %1:_(s8) = G_CONSTANT i8 225
+ %2:_(<2 x s8>) = G_BUILD_VECTOR %0:_(s8), %1:_(s8)
+ %3:_(<2 x s8>) = G_BUILD_VECTOR %0:_(s8), %1:_(s8)
+ %4:_(<2 x s8>) = G_UMULH %2, %3
+...
+---
+name: VecPartScalar
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: @VecPartScalar
+ ; CHECK-NEXT: %0:_ KnownBits:11111010 SignBits:5
+ ; CHECK-NEXT: %1:_ KnownBits:11111010 SignBits:5
+ ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %3:_ KnownBits:00001111 SignBits:4
+ ; CHECK-NEXT: %4:_ KnownBits:0000???? SignBits:4
+ ; CHECK-NEXT: %5:_ KnownBits:0000???? SignBits:4
+ ; CHECK-NEXT: %6:_ KnownBits:0000???? SignBits:4
+ %0:_(s8) = G_CONSTANT i8 250
+ %1:_(<2 x s8>) = G_BUILD_VECTOR %0:_(s8), %0:_(s8)
+ %2:_(s8) = G_IMPLICIT_DEF
+ %3:_(s8) = G_CONSTANT i8 15
+ %4:_(s8) = G_AND %2, %3
+ %5:_(<2 x s8>) = G_BUILD_VECTOR %4:_(s8), %4:_(s8)
+ %6:_(<2 x s8>) = G_UMULH %1, %5
+...
diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll
index 322a96a..e8e5631 100644
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll
@@ -739,14 +739,12 @@ define ptr @postidx32_shalf(ptr %src, ptr %out, half %a) {
;
; GISEL-LABEL: postidx32_shalf:
; GISEL: ; %bb.0:
-; GISEL-NEXT: movi d1, #0000000000000000
-; GISEL-NEXT: ldr h2, [x0], #4
+; GISEL-NEXT: ldr h1, [x0], #4
; GISEL-NEXT: ; kill: def $h0 killed $h0 def $s0
; GISEL-NEXT: fmov w9, s0
-; GISEL-NEXT: fcvt s3, h2
-; GISEL-NEXT: fmov w8, s2
-; GISEL-NEXT: fcvt s1, h1
-; GISEL-NEXT: fcmp s3, s1
+; GISEL-NEXT: fcvt s2, h1
+; GISEL-NEXT: fmov w8, s1
+; GISEL-NEXT: fcmp s2, #0.0
; GISEL-NEXT: csel w8, w8, w9, mi
; GISEL-NEXT: strh w8, [x1]
; GISEL-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/f16-instructions.ll b/llvm/test/CodeGen/AArch64/f16-instructions.ll
index b234ef7..085170c 100644
--- a/llvm/test/CodeGen/AArch64/f16-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/f16-instructions.ll
@@ -782,18 +782,16 @@ define void @test_fccmp(half %in, ptr %out) {
;
; CHECK-CVT-GI-LABEL: test_fccmp:
; CHECK-CVT-GI: // %bb.0:
-; CHECK-CVT-GI-NEXT: adrp x8, .LCPI29_0
; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-GI-NEXT: fcvt s2, h0
-; CHECK-CVT-GI-NEXT: ldr h1, [x8, :lo12:.LCPI29_0]
-; CHECK-CVT-GI-NEXT: adrp x8, .LCPI29_1
-; CHECK-CVT-GI-NEXT: ldr h4, [x8, :lo12:.LCPI29_1]
+; CHECK-CVT-GI-NEXT: fcvt s1, h0
+; CHECK-CVT-GI-NEXT: fmov s2, #5.00000000
+; CHECK-CVT-GI-NEXT: adrp x8, .LCPI29_0
+; CHECK-CVT-GI-NEXT: fmov s3, #8.00000000
+; CHECK-CVT-GI-NEXT: fcmp s1, s2
+; CHECK-CVT-GI-NEXT: ldr h2, [x8, :lo12:.LCPI29_0]
; CHECK-CVT-GI-NEXT: fmov w8, s0
-; CHECK-CVT-GI-NEXT: fcvt s3, h1
-; CHECK-CVT-GI-NEXT: fmov w9, s1
-; CHECK-CVT-GI-NEXT: fcvt s4, h4
-; CHECK-CVT-GI-NEXT: fcmp s2, s3
-; CHECK-CVT-GI-NEXT: fccmp s2, s4, #4, mi
+; CHECK-CVT-GI-NEXT: fmov w9, s2
+; CHECK-CVT-GI-NEXT: fccmp s1, s3, #4, mi
; CHECK-CVT-GI-NEXT: csel w8, w8, w9, gt
; CHECK-CVT-GI-NEXT: strh w8, [x0]
; CHECK-CVT-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/fcvt-fixed.ll b/llvm/test/CodeGen/AArch64/fcvt-fixed.ll
index 7409bfb..743d160 100644
--- a/llvm/test/CodeGen/AArch64/fcvt-fixed.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt-fixed.ll
@@ -149,33 +149,21 @@ define i64 @fcvtzs_f64_i64_64(double %dbl) {
}
define i32 @fcvtzs_f16_i32_7(half %flt) {
-; CHECK-SD-NO16-LABEL: fcvtzs_f16_i32_7:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzs w0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzs_f16_i32_7:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzs w0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_f16_i32_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs w0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzs_f16_i32_7:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI8_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI8_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzs w0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzs_f16_i32_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI8_0
@@ -189,33 +177,21 @@ define i32 @fcvtzs_f16_i32_7(half %flt) {
}
define i32 @fcvtzs_f16_i32_15(half %flt) {
-; CHECK-SD-NO16-LABEL: fcvtzs_f16_i32_15:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzs w0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzs_f16_i32_15:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzs w0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_f16_i32_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs w0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzs_f16_i32_15:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI9_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI9_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzs w0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzs_f16_i32_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI9_0
@@ -229,33 +205,21 @@ define i32 @fcvtzs_f16_i32_15(half %flt) {
}
define i64 @fcvtzs_f16_i64_7(half %flt) {
-; CHECK-SD-NO16-LABEL: fcvtzs_f16_i64_7:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzs x0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzs_f16_i64_7:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzs x0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_f16_i64_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs x0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzs_f16_i64_7:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI10_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI10_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzs x0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzs_f16_i64_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI10_0
@@ -269,33 +233,21 @@ define i64 @fcvtzs_f16_i64_7(half %flt) {
}
define i64 @fcvtzs_f16_i64_15(half %flt) {
-; CHECK-SD-NO16-LABEL: fcvtzs_f16_i64_15:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzs x0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzs_f16_i64_15:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzs x0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_f16_i64_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs x0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzs_f16_i64_15:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI11_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI11_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzs x0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzs_f16_i64_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI11_0
@@ -453,33 +405,21 @@ define i64 @fcvtzu_f64_i64_64(double %dbl) {
}
define i32 @fcvtzu_f16_i32_7(half %flt) {
-; CHECK-SD-NO16-LABEL: fcvtzu_f16_i32_7:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzu w0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzu_f16_i32_7:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzu w0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_f16_i32_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu w0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzu_f16_i32_7:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI20_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI20_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzu w0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzu_f16_i32_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI20_0
@@ -493,33 +433,21 @@ define i32 @fcvtzu_f16_i32_7(half %flt) {
}
define i32 @fcvtzu_f16_i32_15(half %flt) {
-; CHECK-SD-NO16-LABEL: fcvtzu_f16_i32_15:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzu w0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzu_f16_i32_15:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzu w0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_f16_i32_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu w0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzu_f16_i32_15:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI21_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI21_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzu w0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzu_f16_i32_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI21_0
@@ -533,33 +461,21 @@ define i32 @fcvtzu_f16_i32_15(half %flt) {
}
define i64 @fcvtzu_f16_i64_7(half %flt) {
-; CHECK-SD-NO16-LABEL: fcvtzu_f16_i64_7:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzu x0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzu_f16_i64_7:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzu x0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_f16_i64_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu x0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzu_f16_i64_7:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI22_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI22_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzu x0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzu_f16_i64_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI22_0
@@ -573,33 +489,21 @@ define i64 @fcvtzu_f16_i64_7(half %flt) {
}
define i64 @fcvtzu_f16_i64_15(half %flt) {
-; CHECK-SD-NO16-LABEL: fcvtzu_f16_i64_15:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzu x0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzu_f16_i64_15:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzu x0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_f16_i64_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu x0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzu_f16_i64_15:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI23_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI23_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzu x0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzu_f16_i64_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI23_0
@@ -774,13 +678,11 @@ define half @scvtf_f16_i32_7(i32 %int) {
;
; CHECK-GI-NO16-LABEL: scvtf_f16_i32_7:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: scvtf s0, w0
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI32_0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI32_0]
+; CHECK-GI-NO16-NEXT: scvtf s1, w0
+; CHECK-GI-NO16-NEXT: movi v0.2s, #67, lsl #24
+; CHECK-GI-NO16-NEXT: fcvt h1, s1
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
+; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -814,13 +716,11 @@ define half @scvtf_f16_i32_15(i32 %int) {
;
; CHECK-GI-NO16-LABEL: scvtf_f16_i32_15:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: scvtf s0, w0
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI33_0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI33_0]
+; CHECK-GI-NO16-NEXT: scvtf s1, w0
+; CHECK-GI-NO16-NEXT: movi v0.2s, #71, lsl #24
+; CHECK-GI-NO16-NEXT: fcvt h1, s1
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
+; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -854,13 +754,11 @@ define half @scvtf_f16_i64_7(i64 %long) {
;
; CHECK-GI-NO16-LABEL: scvtf_f16_i64_7:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: scvtf s0, x0
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI34_0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI34_0]
+; CHECK-GI-NO16-NEXT: scvtf s1, x0
+; CHECK-GI-NO16-NEXT: movi v0.2s, #67, lsl #24
+; CHECK-GI-NO16-NEXT: fcvt h1, s1
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
+; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -894,13 +792,11 @@ define half @scvtf_f16_i64_15(i64 %long) {
;
; CHECK-GI-NO16-LABEL: scvtf_f16_i64_15:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: scvtf s0, x0
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI35_0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI35_0]
+; CHECK-GI-NO16-NEXT: scvtf s1, x0
+; CHECK-GI-NO16-NEXT: movi v0.2s, #71, lsl #24
+; CHECK-GI-NO16-NEXT: fcvt h1, s1
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
+; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -1078,13 +974,11 @@ define half @ucvtf_f16_i32_7(i32 %int) {
;
; CHECK-GI-NO16-LABEL: ucvtf_f16_i32_7:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: ucvtf s0, w0
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI44_0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI44_0]
+; CHECK-GI-NO16-NEXT: ucvtf s1, w0
+; CHECK-GI-NO16-NEXT: movi v0.2s, #67, lsl #24
+; CHECK-GI-NO16-NEXT: fcvt h1, s1
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
+; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -1118,13 +1012,11 @@ define half @ucvtf_f16_i32_15(i32 %int) {
;
; CHECK-GI-NO16-LABEL: ucvtf_f16_i32_15:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: ucvtf s0, w0
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI45_0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI45_0]
+; CHECK-GI-NO16-NEXT: ucvtf s1, w0
+; CHECK-GI-NO16-NEXT: movi v0.2s, #71, lsl #24
+; CHECK-GI-NO16-NEXT: fcvt h1, s1
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
+; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -1158,13 +1050,11 @@ define half @ucvtf_f16_i64_7(i64 %long) {
;
; CHECK-GI-NO16-LABEL: ucvtf_f16_i64_7:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: ucvtf s0, x0
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI46_0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI46_0]
+; CHECK-GI-NO16-NEXT: ucvtf s1, x0
+; CHECK-GI-NO16-NEXT: movi v0.2s, #67, lsl #24
+; CHECK-GI-NO16-NEXT: fcvt h1, s1
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
+; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -1198,13 +1088,11 @@ define half @ucvtf_f16_i64_15(i64 %long) {
;
; CHECK-GI-NO16-LABEL: ucvtf_f16_i64_15:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: ucvtf s0, x0
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI47_0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI47_0]
+; CHECK-GI-NO16-NEXT: ucvtf s1, x0
+; CHECK-GI-NO16-NEXT: movi v0.2s, #71, lsl #24
+; CHECK-GI-NO16-NEXT: fcvt h1, s1
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
+; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -1356,33 +1244,21 @@ define i64 @fcvtzs_sat_f64_i64_64(double %dbl) {
}
define i32 @fcvtzs_sat_f16_i32_7(half %dbl) {
-; CHECK-SD-NO16-LABEL: fcvtzs_sat_f16_i32_7:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzs w0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzs_sat_f16_i32_7:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzs w0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_sat_f16_i32_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs w0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzs_sat_f16_i32_7:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI55_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI55_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzs w0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzs_sat_f16_i32_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI55_0
@@ -1396,33 +1272,21 @@ define i32 @fcvtzs_sat_f16_i32_7(half %dbl) {
}
define i32 @fcvtzs_sat_f16_i32_15(half %dbl) {
-; CHECK-SD-NO16-LABEL: fcvtzs_sat_f16_i32_15:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzs w0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzs_sat_f16_i32_15:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzs w0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_sat_f16_i32_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs w0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzs_sat_f16_i32_15:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI56_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI56_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzs w0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzs_sat_f16_i32_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI56_0
@@ -1436,33 +1300,21 @@ define i32 @fcvtzs_sat_f16_i32_15(half %dbl) {
}
define i64 @fcvtzs_sat_f16_i64_7(half %dbl) {
-; CHECK-SD-NO16-LABEL: fcvtzs_sat_f16_i64_7:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzs x0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzs_sat_f16_i64_7:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzs x0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_sat_f16_i64_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs x0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzs_sat_f16_i64_7:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI57_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI57_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzs x0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzs_sat_f16_i64_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI57_0
@@ -1476,33 +1328,21 @@ define i64 @fcvtzs_sat_f16_i64_7(half %dbl) {
}
define i64 @fcvtzs_sat_f16_i64_15(half %dbl) {
-; CHECK-SD-NO16-LABEL: fcvtzs_sat_f16_i64_15:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzs x0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzs_sat_f16_i64_15:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzs x0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_sat_f16_i64_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs x0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzs_sat_f16_i64_15:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI58_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI58_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzs x0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzs_sat_f16_i64_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI58_0
@@ -1650,33 +1490,21 @@ define i64 @fcvtzu_sat_f64_i64_64(double %dbl) {
}
define i32 @fcvtzu_sat_f16_i32_7(half %dbl) {
-; CHECK-SD-NO16-LABEL: fcvtzu_sat_f16_i32_7:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzu w0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzu_sat_f16_i32_7:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzu w0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_sat_f16_i32_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu w0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzu_sat_f16_i32_7:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI66_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI66_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzu w0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzu_sat_f16_i32_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI66_0
@@ -1690,33 +1518,21 @@ define i32 @fcvtzu_sat_f16_i32_7(half %dbl) {
}
define i32 @fcvtzu_sat_f16_i32_15(half %dbl) {
-; CHECK-SD-NO16-LABEL: fcvtzu_sat_f16_i32_15:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzu w0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzu_sat_f16_i32_15:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzu w0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_sat_f16_i32_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu w0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzu_sat_f16_i32_15:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI67_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI67_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzu w0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzu_sat_f16_i32_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI67_0
@@ -1730,33 +1546,21 @@ define i32 @fcvtzu_sat_f16_i32_15(half %dbl) {
}
define i64 @fcvtzu_sat_f16_i64_7(half %dbl) {
-; CHECK-SD-NO16-LABEL: fcvtzu_sat_f16_i64_7:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzu x0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzu_sat_f16_i64_7:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzu x0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_sat_f16_i64_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu x0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzu_sat_f16_i64_7:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI68_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI68_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzu x0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzu_sat_f16_i64_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI68_0
@@ -1770,33 +1574,21 @@ define i64 @fcvtzu_sat_f16_i64_7(half %dbl) {
}
define i64 @fcvtzu_sat_f16_i64_15(half %dbl) {
-; CHECK-SD-NO16-LABEL: fcvtzu_sat_f16_i64_15:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzu x0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzu_sat_f16_i64_15:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzu x0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_sat_f16_i64_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu x0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzu_sat_f16_i64_15:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI69_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI69_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzu x0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzu_sat_f16_i64_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI69_0
@@ -1811,4 +1603,3 @@ define i64 @fcvtzu_sat_f16_i64_15(half %dbl) {
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}
; CHECK-FP16: {{.*}}
-; CHECK-NO16: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/frem-power2.ll b/llvm/test/CodeGen/AArch64/frem-power2.ll
index 98276b6..e1bc742 100644
--- a/llvm/test/CodeGen/AArch64/frem-power2.ll
+++ b/llvm/test/CodeGen/AArch64/frem-power2.ll
@@ -100,9 +100,8 @@ define half @hrem2_nsz(half %x) {
; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
; CHECK-GI-NEXT: .cfi_offset w30, -16
-; CHECK-GI-NEXT: fmov h1, #2.00000000
; CHECK-GI-NEXT: fcvt s0, h0
-; CHECK-GI-NEXT: fcvt s1, h1
+; CHECK-GI-NEXT: fmov s1, #2.00000000
; CHECK-GI-NEXT: bl fmodf
; CHECK-GI-NEXT: fcvt h0, s0
; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/pr58431.ll b/llvm/test/CodeGen/AArch64/pr58431.ll
index 467ceb0..a373004 100644
--- a/llvm/test/CodeGen/AArch64/pr58431.ll
+++ b/llvm/test/CodeGen/AArch64/pr58431.ll
@@ -9,7 +9,7 @@ define i32 @f(i64 %0) {
; CHECK-NEXT: mov w10, #10 // =0xa
; CHECK-NEXT: eor x8, x8, #0x8000000000000003
; CHECK-NEXT: umulh x8, x9, x8
-; CHECK-NEXT: msub x0, x8, x10, x9
+; CHECK-NEXT: umsubl x0, w8, w10, x9
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT: ret
%2 = trunc i64 %0 to i32
diff --git a/llvm/test/CodeGen/AArch64/sve-asrd.ll b/llvm/test/CodeGen/AArch64/sve-asrd.ll
new file mode 100644
index 0000000..66db1a5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-asrd.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mattr=+sve -combiner-disabled < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Ensure we don't try to represent sdiv-by-one using ARSD.
+define <16 x i16> @sdiv_by_one_v16i16(<16 x i16> %a) vscale_range(2,2) {
+; CHECK-LABEL: sdiv_by_one_v16i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: adrp x8, .LCPI0_0
+; CHECK-NEXT: add x8, x8, :lo12:.LCPI0_0
+; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8]
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: sunpklo z1.s, z1.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: sunpklo z3.s, z2.h
+; CHECK-NEXT: ext z2.b, z2.b, z2.b, #16
+; CHECK-NEXT: sunpklo z2.s, z2.h
+; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z3.s
+; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: ptrue p0.h, vl8
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h
+; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
+; CHECK-NEXT: movprfx z1, z0
+; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1
+; CHECK-NEXT: ret
+ %res = sdiv <16 x i16> %a, splat(i16 1)
+ ret <16 x i16> %res
+}
+
+; Ensure we don't try to represent sdiv-by-one using ARSD.
+define <vscale x 8 x i16> @sdiv_by_one_nxv8i16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: sdiv_by_one_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.h, #1 // =0x1
+; CHECK-NEXT: sunpkhi z2.s, z0.h
+; CHECK-NEXT: sunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: sunpkhi z3.s, z1.h
+; CHECK-NEXT: sunpklo z1.s, z1.h
+; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT: ret
+ %res = sdiv <vscale x 8 x i16> %a, splat(i16 1)
+ ret <vscale x 8 x i16> %res
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
index 139ecaf..67197b3fe 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
@@ -231,6 +231,274 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv
ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %retval
}
+define {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv6f16(<vscale x 6 x half> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv6f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uunpkhi z2.d, z0.s
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: str z1, [sp, #2, mul vl]
+; CHECK-NEXT: str z2, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: ld3d { z0.d - z2.d }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave3.nxv6f16(<vscale x 6 x half> %vec)
+ ret {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} %retval
+}
+
+define {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_nxv12f16(<vscale x 12 x half> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv12f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpkhi z2.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: str z1, [sp, #2, mul vl]
+; CHECK-NEXT: str z2, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: ld3w { z0.s - z2.s }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave3.nxv12f16(<vscale x 12 x half> %vec)
+ ret {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} %retval
+}
+
+define {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_nxv24f16(<vscale x 24 x half> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv24f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: str z2, [sp, #2, mul vl]
+; CHECK-NEXT: str z1, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: ld3h { z0.h - z2.h }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave3.nxv24f16(<vscale x 24 x half> %vec)
+ ret {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} %retval
+}
+
+define {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32_nxv6f32(<vscale x 6 x float> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv6f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uunpkhi z2.d, z0.s
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: str z1, [sp, #2, mul vl]
+; CHECK-NEXT: str z2, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: ld3d { z0.d - z2.d }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave3.nxv6f32(<vscale x 6 x float> %vec)
+ ret {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} %retval
+}
+
+define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_nxv4f32_nxv12f32(<vscale x 12 x float> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv12f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: str z2, [sp, #2, mul vl]
+; CHECK-NEXT: str z1, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: ld3w { z0.s - z2.s }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave3.nxv12f32(<vscale x 12 x float> %vec)
+ ret {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %retval
+}
+
+define {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f64_nxv6f64(<vscale x 6 x double> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv6f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: str z2, [sp, #2, mul vl]
+; CHECK-NEXT: str z1, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: ld3d { z0.d - z2.d }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave3.nxv6f64(<vscale x 6 x double> %vec)
+ ret {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} %retval
+}
+
+define {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @vector_deinterleave_nxv2bf16_nxv6bf16(<vscale x 6 x bfloat> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv2bf16_nxv6bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uunpkhi z2.d, z0.s
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: str z1, [sp, #2, mul vl]
+; CHECK-NEXT: str z2, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: ld3d { z0.d - z2.d }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @llvm.vector.deinterleave3.nxv6bf16(<vscale x 6 x bfloat> %vec)
+ ret {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} %retval
+}
+
+define {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @vector_deinterleave_nxv4bf16_nxv12bf16(<vscale x 12 x bfloat> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv4bf16_nxv12bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpkhi z2.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: str z1, [sp, #2, mul vl]
+; CHECK-NEXT: str z2, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: ld3w { z0.s - z2.s }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @llvm.vector.deinterleave3.nxv12bf16(<vscale x 12 x bfloat> %vec)
+ ret {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} %retval
+}
+
+define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @vector_deinterleave_nxv8bf16_nxv24bf16(<vscale x 24 x bfloat> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv8bf16_nxv24bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: str z2, [sp, #2, mul vl]
+; CHECK-NEXT: str z1, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: ld3h { z0.h - z2.h }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.vector.deinterleave3.nxv24bf16(<vscale x 24 x bfloat> %vec)
+ ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %retval
+}
+
+; Integers
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv48i8(<vscale x 48 x i8> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv48i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: str z2, [sp, #2, mul vl]
+; CHECK-NEXT: str z1, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: ld3b { z0.b - z2.b }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave3.nxv48i8(<vscale x 48 x i8> %vec)
+ ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %retval
+}
+
+define {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv24i16(<vscale x 24 x i16> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv24i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: str z2, [sp, #2, mul vl]
+; CHECK-NEXT: str z1, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: ld3h { z0.h - z2.h }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave3.nxv24i16(<vscale x 24 x i16> %vec)
+ ret {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} %retval
+}
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxvv12i32(<vscale x 12 x i32> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv4i32_nxvv12i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: str z2, [sp, #2, mul vl]
+; CHECK-NEXT: str z1, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: ld3w { z0.s - z2.s }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32> %vec)
+ ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %retval
+}
+
+define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv6i64(<vscale x 6 x i64> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv6i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: str z2, [sp, #2, mul vl]
+; CHECK-NEXT: str z1, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: ld3d { z0.d - z2.d }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave3.nxv6i64(<vscale x 6 x i64> %vec)
+ ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %retval
+}
+
define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv64i8(<vscale x 64 x i8> %vec) {
; SVE-LABEL: vector_deinterleave_nxv16i8_nxv64i8:
; SVE: // %bb.0:
@@ -599,31 +867,3 @@ define {<vscale x 2 x i32>, <vscale x 2 x i32>} @vector_deinterleave_nxv2i32_nxv
%retval = call {<vscale x 2 x i32>,<vscale x 2 x i32>} @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %vec)
ret {<vscale x 2 x i32>, <vscale x 2 x i32>} %retval
}
-
-; Floating declarations
-declare {<vscale x 2 x half>,<vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
-declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
-declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
-declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
-declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
-declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
-
-; Integer declarations
-declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
-declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
-declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
-declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
-
-; Predicated declarations
-declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>)
-declare {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.vector.deinterleave2.nxv16i1(<vscale x 16 x i1>)
-declare {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.vector.deinterleave2.nxv8i1(<vscale x 8 x i1>)
-declare {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.vector.deinterleave2.nxv4i1(<vscale x 4 x i1>)
-
-; Illegal size type
-declare {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
-declare {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.vector.deinterleave2.nxv16i64(<vscale x 16 x i64>)
-
-declare {<vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave2.nxv16i8(<vscale x 16 x i8>)
-declare {<vscale x 4 x i16>, <vscale x 4 x i16>} @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16>)
-declare {<vscale x 2 x i32>, <vscale x 2 x i32>} @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32>)
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
index c7fb2db..49f185c 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
@@ -221,6 +221,318 @@ define <vscale x 4 x i64> @interleave2_nxv4i64(<vscale x 2 x i64> %vec0, <vscale
ret <vscale x 4 x i64> %retval
}
+define <vscale x 6 x half> @interleave3_nxv6f16(<vscale x 2 x half> %vec0, <vscale x 2 x half> %vec1, <vscale x 2 x half> %vec2) {
+; CHECK-LABEL: interleave3_nxv6f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3d { z0.d - z2.d }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z2, [sp]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z1.s, z2.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <vscale x 6 x half> @llvm.vector.interleave3.nxv6f16(<vscale x 2 x half> %vec0, <vscale x 2 x half> %vec1, <vscale x 2 x half> %vec2)
+ ret <vscale x 6 x half> %retval
+}
+
+define <vscale x 12 x half> @interleave3_nxv12f16(<vscale x 4 x half> %vec0, <vscale x 4 x half> %vec1, <vscale x 4 x half> %vec2) {
+; CHECK-LABEL: interleave3_nxv12f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-5
+; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x28, 0x1e, 0x22 // sp + 16 + 40 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: addpl x8, sp, #4
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3w { z0.s - z2.s }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z1, [sp]
+; CHECK-NEXT: ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: st1h { z2.s }, p0, [x8, #7, mul vl]
+; CHECK-NEXT: str z0, [sp, #3, mul vl]
+; CHECK-NEXT: ldr z1, [sp, #4, mul vl]
+; CHECK-NEXT: ldr z0, [sp, #3, mul vl]
+; CHECK-NEXT: addvl sp, sp, #5
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <vscale x 12 x half> @llvm.vector.interleave3.nxv12f16(<vscale x 4 x half> %vec0, <vscale x 4 x half> %vec1, <vscale x 4 x half> %vec2)
+ ret <vscale x 12 x half> %retval
+}
+
+define <vscale x 24 x half> @interleave3_nxv24f16(<vscale x 8 x half> %vec0, <vscale x 8 x half> %vec1, <vscale x 8 x half> %vec2) {
+; CHECK-LABEL: interleave3_nxv24f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3h { z0.h - z2.h }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <vscale x 24 x half> @llvm.vector.interleave3.nxv24f16(<vscale x 8 x half> %vec0, <vscale x 8 x half> %vec1, <vscale x 8 x half> %vec2)
+ ret <vscale x 24 x half> %retval
+}
+
+define <vscale x 6 x float> @interleave3_nxv6f32(<vscale x 2 x float> %vec0, <vscale x 2 x float> %vec1, <vscale x 2 x float> %vec2) {
+; CHECK-LABEL: interleave3_nxv6f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-5
+; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x28, 0x1e, 0x22 // sp + 16 + 40 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: addpl x8, sp, #4
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3d { z0.d - z2.d }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z1, [sp]
+; CHECK-NEXT: ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: st1w { z2.d }, p0, [x8, #7, mul vl]
+; CHECK-NEXT: str z0, [sp, #3, mul vl]
+; CHECK-NEXT: ldr z1, [sp, #4, mul vl]
+; CHECK-NEXT: ldr z0, [sp, #3, mul vl]
+; CHECK-NEXT: addvl sp, sp, #5
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <vscale x 6 x float> @llvm.vector.interleave3.nxv6f32(<vscale x 2 x float> %vec0, <vscale x 2 x float> %vec1, <vscale x 2 x float> %vec2)
+ ret <vscale x 6 x float> %retval
+}
+
+define <vscale x 12 x float> @interleave3_nxv12f32(<vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1, <vscale x 4 x float> %vec2) {
+; CHECK-LABEL: interleave3_nxv12f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3w { z0.s - z2.s }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <vscale x 12 x float> @llvm.vector.interleave3.nxv12f32(<vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1, <vscale x 4 x float> %vec2)
+ ret <vscale x 12 x float> %retval
+}
+
+define <vscale x 6 x double> @interleave3_nxv6f64(<vscale x 2 x double> %vec0, <vscale x 2 x double> %vec1, <vscale x 2 x double> %vec2) {
+; CHECK-LABEL: interleave3_nxv6f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3d { z0.d - z2.d }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <vscale x 6 x double>@llvm.vector.interleave3.nxv6f64(<vscale x 2 x double> %vec0, <vscale x 2 x double> %vec1, <vscale x 2 x double> %vec2)
+ ret <vscale x 6 x double> %retval
+}
+
+define <vscale x 6 x bfloat> @interleave3_nxv6bf16(<vscale x 2 x bfloat> %vec0, <vscale x 2 x bfloat> %vec1, <vscale x 2 x bfloat> %vec2) {
+; CHECK-LABEL: interleave3_nxv6bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3d { z0.d - z2.d }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z2, [sp]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z1.s, z2.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <vscale x 6 x bfloat> @llvm.vector.interleave3.nxv6bf16(<vscale x 2 x bfloat> %vec0, <vscale x 2 x bfloat> %vec1, <vscale x 2 x bfloat> %vec2)
+ ret <vscale x 6 x bfloat> %retval
+}
+
+define <vscale x 12 x bfloat> @interleave3_nxv12bf16(<vscale x 4 x bfloat> %vec0, <vscale x 4 x bfloat> %vec1, <vscale x 4 x bfloat> %vec2) {
+; CHECK-LABEL: interleave3_nxv12bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-5
+; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x28, 0x1e, 0x22 // sp + 16 + 40 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: addpl x8, sp, #4
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3w { z0.s - z2.s }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z1, [sp]
+; CHECK-NEXT: ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: st1h { z2.s }, p0, [x8, #7, mul vl]
+; CHECK-NEXT: str z0, [sp, #3, mul vl]
+; CHECK-NEXT: ldr z1, [sp, #4, mul vl]
+; CHECK-NEXT: ldr z0, [sp, #3, mul vl]
+; CHECK-NEXT: addvl sp, sp, #5
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <vscale x 12 x bfloat> @llvm.vector.interleave3.nxv12bf16(<vscale x 4 x bfloat> %vec0, <vscale x 4 x bfloat> %vec1, <vscale x 4 x bfloat> %vec2)
+ ret <vscale x 12 x bfloat> %retval
+}
+
+define <vscale x 24 x bfloat> @interleave3_nxv24bf16(<vscale x 8 x bfloat> %vec0, <vscale x 8 x bfloat> %vec1, <vscale x 8 x bfloat> %vec2) {
+; CHECK-LABEL: interleave3_nxv24bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3h { z0.h - z2.h }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <vscale x 24 x bfloat> @llvm.vector.interleave3.nxv24bf16(<vscale x 8 x bfloat> %vec0, <vscale x 8 x bfloat> %vec1, <vscale x 8 x bfloat> %vec2)
+ ret <vscale x 24 x bfloat> %retval
+}
+
+; Integers
+
+define <vscale x 48 x i8> @interleave3_nxv48i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1, <vscale x 16 x i8> %vec2) {
+; CHECK-LABEL: interleave3_nxv48i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3b { z0.b - z2.b }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <vscale x 48 x i8> @llvm.vector.interleave3.nxv48i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1, <vscale x 16 x i8> %vec2)
+ ret <vscale x 48 x i8> %retval
+}
+
+define <vscale x 24 x i16> @interleave3_nxv24i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1, <vscale x 8 x i16> %vec2) {
+; CHECK-LABEL: interleave3_nxv24i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3h { z0.h - z2.h }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <vscale x 24 x i16> @llvm.vector.interleave3.nxv24i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1, <vscale x 8 x i16> %vec2)
+ ret <vscale x 24 x i16> %retval
+}
+
+define <vscale x 12 x i32> @interleave3_nxv12i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2) {
+; CHECK-LABEL: interleave3_nxv12i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3w { z0.s - z2.s }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <vscale x 12 x i32> @llvm.vector.interleave3.nxv12i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2)
+ ret <vscale x 12 x i32> %retval
+}
+
+define <vscale x 6 x i64> @interleave3_nxv6i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1, <vscale x 2 x i64> %vec2) {
+; CHECK-LABEL: interleave3_nxv6i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3d { z0.d - z2.d }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <vscale x 6 x i64> @llvm.vector.interleave3.nxv6i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1, <vscale x 2 x i64> %vec2)
+ ret <vscale x 6 x i64> %retval
+}
+
define <vscale x 64 x i8> @interleave4_nxv16i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1, <vscale x 16 x i8> %vec2, <vscale x 16 x i8> %vec3) {
; SVE-LABEL: interleave4_nxv16i8:
; SVE: // %bb.0:
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
index be07978..8e0328e 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
@@ -38,17 +38,11 @@ define half @add_v2HalfH(<2 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: add_v2HalfH:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI1_0
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
-; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI1_0]
-; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
-; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
-; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s0, s1, s0
+; CHECK-GI-NOFP16-NEXT: fadd s0, s0, s1
; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
; CHECK-GI-NOFP16-NEXT: ret
;
@@ -88,19 +82,13 @@ define half @add_v3HalfH(<3 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: add_v3HalfH:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI2_0
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
-; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI2_0]
-; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[2]
-; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
-; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT: fadd s1, s2, s1
; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
; CHECK-GI-NOFP16-NEXT: fadd s0, s1, s0
@@ -152,17 +140,11 @@ define half @add_HalfH(<4 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: add_HalfH:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI3_0
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
-; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI3_0]
-; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
-; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-GI-NOFP16-NEXT: fadd s1, s2, s1
; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[2]
; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[3]
; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
@@ -250,16 +232,10 @@ define half @add_H(<8 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: add_H:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI4_0
+; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
-; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI4_0]
-; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
-; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-GI-NOFP16-NEXT: fadd s1, s2, s1
; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[2]
; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
@@ -448,16 +424,10 @@ define half @add_2H(<16 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: add_2H:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI7_0
+; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s3, h0
-; CHECK-GI-NOFP16-NEXT: ldr h2, [x8, :lo12:.LCPI7_0]
-; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT: fadd s2, s2, s3
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvt h2, s2
-; CHECK-GI-NOFP16-NEXT: fcvt s3, h3
; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT: fadd s2, s2, s3
+; CHECK-GI-NOFP16-NEXT: fadd s2, s3, s2
; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2]
; CHECK-GI-NOFP16-NEXT: fcvt h2, s2
; CHECK-GI-NOFP16-NEXT: fcvt s3, h3
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
index c10d6e9..716401e 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
@@ -52,17 +52,11 @@ define half @mul_HalfH(<4 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: mul_HalfH:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI1_0
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
-; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI1_0]
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s2
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
-; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s2
+; CHECK-GI-NOFP16-NEXT: fmul s1, s2, s1
; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[2]
; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[3]
; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
@@ -144,16 +138,10 @@ define half @mul_H(<8 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: mul_H:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI2_0
+; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
-; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI2_0]
-; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s2
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
-; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s2
+; CHECK-GI-NOFP16-NEXT: fmul s1, s2, s1
; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[2]
; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
@@ -321,16 +309,10 @@ define half @mul_2H(<16 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: mul_2H:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI5_0
+; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s3, h0
-; CHECK-GI-NOFP16-NEXT: ldr h2, [x8, :lo12:.LCPI5_0]
; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT: fmul s2, s2, s3
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvt h2, s2
-; CHECK-GI-NOFP16-NEXT: fcvt s3, h3
-; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT: fmul s2, s2, s3
+; CHECK-GI-NOFP16-NEXT: fmul s2, s3, s2
; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2]
; CHECK-GI-NOFP16-NEXT: fcvt h2, s2
; CHECK-GI-NOFP16-NEXT: fcvt s3, h3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 1aee6ab..1b879a6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -403,40 +403,38 @@ define half @v_neg_rcp_f16(half %x) {
; GFX6-IEEE-LABEL: v_neg_rcp_f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_neg_rcp_f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
@@ -460,40 +458,38 @@ define half @v_rcp_f16(half %x) {
; GFX6-IEEE-LABEL: v_rcp_f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
@@ -517,40 +513,38 @@ define half @v_rcp_f16_arcp(half %x) {
; GFX6-IEEE-LABEL: v_rcp_f16_arcp:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_f16_arcp:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
@@ -575,9 +569,7 @@ define half @v_rcp_f16_arcp_afn(half %x) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -600,40 +592,38 @@ define half @v_rcp_f16_ulp25(half %x) {
; GFX6-IEEE-LABEL: v_rcp_f16_ulp25:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_f16_ulp25:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
@@ -1454,70 +1444,67 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX6-IEEE-LABEL: v_rcp_v2f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_v2f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -1526,30 +1513,27 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1561,26 +1545,23 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX8-FLUSH-LABEL: v_rcp_v2f16:
; GFX8-FLUSH: ; %bb.0:
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v1, v4, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1594,30 +1575,27 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1628,26 +1606,24 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX9-FLUSH-LABEL: v_rcp_v2f16:
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, 1.0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v1, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v1, v1
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1660,30 +1636,27 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, 1.0, v2
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -1696,24 +1669,21 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -1726,27 +1696,25 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX11-NEXT: v_mov_b32_e32 v4, 1.0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX11-NEXT: v_fma_mix_f32 v5, -v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v5, v5, v2, v2
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
@@ -1757,70 +1725,67 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX6-IEEE-LABEL: v_neg_rcp_v2f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_neg_rcp_v2f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -1829,30 +1794,27 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, -1.0, v1
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v4, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -1864,26 +1826,23 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX8-FLUSH-LABEL: v_neg_rcp_v2f16:
; GFX8-FLUSH: ; %bb.0:
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v1, v4, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -1897,30 +1856,27 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX9-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, -1.0, v1
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_sub_f32_e32 v4, v7, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -1931,26 +1887,24 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX9-FLUSH-LABEL: v_neg_rcp_v2f16:
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, -1.0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, v0, v1, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v1, -v1
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, -v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -1963,30 +1917,27 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_sub_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, -1.0, v2
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -1999,24 +1950,21 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -2029,27 +1977,25 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX11-NEXT: v_mov_b32_e32 v4, -1.0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX11-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v6, v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v5, v5, v2, -v2
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x half> <half -1.0, half -1.0>, %x
@@ -2064,33 +2010,32 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v2, v1
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v5, v5, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v5, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v5, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v1, 1.0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v4, v4, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v5, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v1, v5, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v2, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v5, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v1, v4, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -2101,39 +2046,37 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v2, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v5, v5, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v3, v3, 1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v4, v5, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v3, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v3, v2
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v5, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v2, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v5, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v3, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -2143,30 +2086,27 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX8-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -2179,26 +2119,23 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX8-FLUSH: ; %bb.0:
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v1, v4, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -2213,30 +2150,27 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX9-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -2248,26 +2182,24 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, 1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v5, 1.0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v2
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v5, v4
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v6, v7, v2
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v7, v4
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v2, v5 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -|v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v2, v2
+; GFX9-FLUSH-NEXT: v_mad_f32 v7, v7, v4, v4
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v8, v2
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0
; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v7
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
@@ -2279,32 +2211,29 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX10-IEEE: ; %bb.0:
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, 1.0, v2
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -2316,26 +2245,23 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX10-FLUSH: ; %bb.0:
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -2346,30 +2272,30 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX11-LABEL: v_rcp_v2f16_fabs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v5, 1.0
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX11-NEXT: v_rcp_f32_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v6, v5, v3
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1]
; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
-; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v3
; GFX11-NEXT: v_rcp_f32_e32 v4, v4
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_mov_b32_e32 v5, 1.0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v5, v4
-; GFX11-NEXT: v_fma_mix_f32 v8, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v8, v4
-; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v3, v7, v3 :: v_dual_mul_f32 v0, v0, v4
+; GFX11-NEXT: v_fma_mix_f32 v7, -|v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v7, v7, v4, v4
+; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX11-NEXT: v_and_b32_e32 v0, 0xff800000, v0
-; GFX11-NEXT: v_dual_add_f32 v0, v0, v5 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v7
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v0, v2, 1.0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_fma_mix_f32 v6, -v1, v3, v5 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3
+; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_mul_f32_e32 v3, v8, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2386,33 +2312,32 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v2, v1
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v5, v5, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v5, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v5, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v1, -1.0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v4, v4, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v4, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v5, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v1, v5, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v2, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v5, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v1, v4, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -2423,39 +2348,37 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v2, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v5, v5, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v3, v3, -1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v4, v5, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v3, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v3, v2
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v5, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v2, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v5, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v3, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -2465,30 +2388,27 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX8-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, -1.0, v1
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v4, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -2501,26 +2421,23 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX8-FLUSH: ; %bb.0:
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v1, v4, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -2535,30 +2452,27 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX9-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX9-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, -1.0, v1
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_sub_f32_e32 v4, v7, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -2570,26 +2484,24 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, -1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v5, -1.0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v2
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v5, v4
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v6, v7, v2
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v7, v4
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, v1, v2, v5 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, |v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v2, -v2
+; GFX9-FLUSH-NEXT: v_mad_f32 v7, v7, v4, -v4
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v8, v2
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0
; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v7
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
@@ -2601,32 +2513,29 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX10-IEEE: ; %bb.0:
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_sub_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, -1.0, v2
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -2638,26 +2547,23 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX10-FLUSH: ; %bb.0:
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -2668,30 +2574,30 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX11-LABEL: v_neg_rcp_v2f16_fabs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v5, -1.0
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX11-NEXT: v_rcp_f32_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v6, v5, v3
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1]
; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
-; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v3
; GFX11-NEXT: v_rcp_f32_e32 v4, v4
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_mov_b32_e32 v5, -1.0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v5, v4
-; GFX11-NEXT: v_fma_mix_f32 v8, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v8, v4
-; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v3, v7, v3 :: v_dual_mul_f32 v0, v0, v4
+; GFX11-NEXT: v_fma_mix_f32 v7, |v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v7, v7, v4, -v4
+; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX11-NEXT: v_and_b32_e32 v0, 0xff800000, v0
-; GFX11-NEXT: v_dual_add_f32 v0, v0, v5 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v7
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v0, v2, -1.0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_fma_mix_f32 v6, v1, v3, v5 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3
+; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_mul_f32_e32 v3, v8, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2704,70 +2610,67 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
; GFX6-IEEE-LABEL: v_rcp_v2f16_arcp:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -2814,11 +2717,8 @@ define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) {
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_f32_e32 v0, v2, v0
-; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -2864,70 +2764,67 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX6-IEEE-LABEL: v_rcp_v2f16_ulp25:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_v2f16_ulp25:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -2936,30 +2833,27 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -2971,26 +2865,23 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX8-FLUSH-LABEL: v_rcp_v2f16_ulp25:
; GFX8-FLUSH: ; %bb.0:
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v1, v4, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -3004,30 +2895,27 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -3038,26 +2926,24 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX9-FLUSH-LABEL: v_rcp_v2f16_ulp25:
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, 1.0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v1, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v1, v1
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -3070,30 +2956,27 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, 1.0, v2
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -3106,24 +2989,21 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -3136,27 +3016,25 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX11-NEXT: v_mov_b32_e32 v4, 1.0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX11-NEXT: v_fma_mix_f32 v5, -v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v5, v5, v2, v2
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
@@ -4033,40 +3911,38 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
define amdgpu_ps i16 @s_rcp_f16(i16 inreg %a.arg) {
; GFX6-IEEE-LABEL: s_rcp_f16:
; GFX6-IEEE: ; %bb.0:
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, 1.0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-IEEE-NEXT: ; return to shader part epilog
;
; GFX6-FLUSH-LABEL: s_rcp_f16:
; GFX6-FLUSH: ; %bb.0:
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, 1.0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
@@ -4099,40 +3975,38 @@ define amdgpu_ps i16 @s_rcp_f16(i16 inreg %a.arg) {
define amdgpu_ps i16 @s_neg_rcp_f16(i16 inreg %a.arg) {
; GFX6-IEEE-LABEL: s_neg_rcp_f16:
; GFX6-IEEE: ; %bb.0:
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, -1.0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-IEEE-NEXT: ; return to shader part epilog
;
; GFX6-FLUSH-LABEL: s_neg_rcp_f16:
; GFX6-FLUSH: ; %bb.0:
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, -1.0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
@@ -4166,21 +4040,20 @@ define amdgpu_ps i16 @s_rsq_f16(i16 inreg %a.arg) {
; GFX6-IEEE-LABEL: s_rsq_f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-IEEE-NEXT: ; return to shader part epilog
@@ -4188,24 +4061,23 @@ define amdgpu_ps i16 @s_rsq_f16(i16 inreg %a.arg) {
; GFX6-FLUSH-LABEL: s_rsq_f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-FLUSH-NEXT: ; return to shader part epilog
@@ -4241,36 +4113,35 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
; GFX6-IEEE-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[0:1], v0, v0, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v6, s[0:1], v1, v1, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v5, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v9, v5, v5
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v5
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v10, -v3, v9, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v5, v9
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v6, v8, 1.0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[0:1], v2, v1, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v6, v4, v7
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v3, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v6, v4, v7
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[0:1], v1, v1, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v8, -v2, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v8, v4, v4
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v5
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v3, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v10, -v2, v9, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v4, v9
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v9, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v8, -v5, v6, 1.0
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v9
+; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[0:1], -1.0, v1, -1.0
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v8, v6, v6
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v3, v7, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v3, v7
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v4, v2, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v3, v7
; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[0:1]
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v4, v2, v3
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -4283,42 +4154,40 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
; GFX6-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[0:1], v0, v0, v2
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, -1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1
@@ -4330,31 +4199,28 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v0, s0
; GFX8-IEEE-NEXT: s_lshr_b32 s0, s0, 16
; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v1, s0
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v2, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v10, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v8, v9
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, -1.0, v2
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
@@ -4369,25 +4235,22 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0
; GFX8-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v1, s0
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v2, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
@@ -4402,25 +4265,22 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v0, s0
; GFX9-IEEE-NEXT: s_lshr_b32 s0, s0, 16
; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, s0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_fma_f32 v8, -v2, v7, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX9-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
-; GFX9-IEEE-NEXT: v_fma_f32 v8, -v3, v9, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v8, v8, v6, v9
-; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_fma_f32 v6, v2, v4, -1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v6, v6, v4, -v4
+; GFX9-IEEE-NEXT: v_fma_f32 v7, v3, v5, -1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v7, v7, v5, -v5
+; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v6, -1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v7, -1.0
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -4434,25 +4294,23 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0
; GFX9-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, s0
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, -1.0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v2, -v2
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, -v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v7, v2
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v6, v3
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -4466,25 +4324,23 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX10-IEEE-NEXT: s_lshr_b32 s1, s0, 16
; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v0, s0
; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, s1
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX10-IEEE-NEXT: v_mov_b32_e32 v4, -1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_f32 v5, v5, v2, -v2
+; GFX10-IEEE-NEXT: v_fma_f32 v6, v6, v3, -v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v7, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -4498,25 +4354,22 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX10-FLUSH-NEXT: s_lshr_b32 s1, s0, 16
; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0
; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v1, s1
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -4530,29 +4383,27 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX11-NEXT: s_lshr_b32 s1, s0, 16
; GFX11-NEXT: v_sqrt_f16_e32 v0, s0
; GFX11-NEXT: v_sqrt_f16_e32 v1, s1
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX11-NEXT: v_mov_b32_e32 v4, -1.0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX11-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v5, v5, v2, -v2
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
@@ -4568,21 +4419,20 @@ define half @v_rsq_f16(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4590,24 +4440,23 @@ define half @v_rsq_f16(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -4632,21 +4481,20 @@ define half @v_neg_rsq_f16(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4654,24 +4502,23 @@ define half @v_neg_rsq_f16(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -4706,21 +4553,20 @@ define { half, half } @v_rsq_f16_multi_use(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v2, v1
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4728,24 +4574,23 @@ define { half, half } @v_rsq_f16_multi_use(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v3, v2, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -4785,21 +4630,20 @@ define half @v_rsq_f16_missing_contract0(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4807,24 +4651,23 @@ define half @v_rsq_f16_missing_contract0(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -4859,21 +4702,20 @@ define half @v_rsq_f16_missing_contract1(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4881,24 +4723,23 @@ define half @v_rsq_f16_missing_contract1(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -4933,21 +4774,20 @@ define half @v_neg_rsq_f16_missing_contract0(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4955,24 +4795,23 @@ define half @v_neg_rsq_f16_missing_contract0(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5007,21 +4846,20 @@ define half @v_neg_rsq_f16_missing_contract1(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -5029,24 +4867,23 @@ define half @v_neg_rsq_f16_missing_contract1(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5081,21 +4918,20 @@ define half @v_neg_rsq_f16_fabs(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -5103,24 +4939,23 @@ define half @v_neg_rsq_f16_fabs(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5156,21 +4991,20 @@ define half @v_rsq_f16_arcp(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -5178,24 +5012,23 @@ define half @v_rsq_f16_arcp(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5220,21 +5053,20 @@ define half @v_neg_rsq_f16_arcp(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -5242,24 +5074,23 @@ define half @v_neg_rsq_f16_arcp(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5294,12 +5125,10 @@ define half @v_rsq_f16_afn(half %a) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -5324,12 +5153,10 @@ define half @v_rsq_f16_afn_nocontract(half %a) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -5365,36 +5192,35 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v6, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v9, v6, v6
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v10, -v3, v9, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v6, v9
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v6, v9
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v5, v8, 1.0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v2, v1, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v5, v4, v7
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v3, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v5, v4, v7
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v1, v1, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v8, -v2, v5, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v8, v5, v5
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v8, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v8, v10, v5, v8
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v9, -v4, v6, 1.0
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v5, v8
+; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], 1.0, v1, 1.0
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v9, v6, v6
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v3, v7, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v4, v3, v7
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v2, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v3, v7
; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[4:5]
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v4, v2, v3
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
@@ -5404,42 +5230,40 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5448,31 +5272,28 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v9, v9, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v10, v10, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v9, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, 1.0, v2
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
@@ -5486,25 +5307,22 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v8, v4, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
@@ -5518,25 +5336,22 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6
-; GFX9-IEEE-NEXT: v_fma_f32 v9, -v2, v7, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v10, -v3, v8, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v7, v9, v5, v7
-; GFX9-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8
-; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v6, v6, v4, v4
+; GFX9-IEEE-NEXT: v_fma_f32 v7, v7, v5, v5
+; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v6, 1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v7, 1.0
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
@@ -5549,25 +5364,23 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, 1.0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v7, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v6, v2
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v1, v2, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v3, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v2, v2
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v5, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v7, v2
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
@@ -5580,25 +5393,23 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX10-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX10-IEEE-NEXT: v_mov_b32_e32 v4, 1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, -v1, v2, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v3, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_f32 v5, v5, v2, v2
+; GFX10-IEEE-NEXT: v_fma_f32 v6, v6, v3, v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v5, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v7, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
@@ -5611,25 +5422,22 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX10-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
@@ -5642,7 +5450,7 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX11-NEXT: v_mov_b32_e32 v4, 1.0
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
@@ -5650,22 +5458,20 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX11-NEXT: v_fma_mix_f32 v5, -v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v6, -v1, v3, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v5, v5, v2, v2
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
@@ -5679,36 +5485,35 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v6, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v9, v6, v6
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v10, -v3, v9, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v6, v9
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v6, v9
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v5, v8, 1.0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v2, v1, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v5, v4, v7
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v3, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v5, v4, v7
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v1, v1, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v8, -v2, v5, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v8, v5, v5
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v8, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v8, v10, v5, v8
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v9, -v4, v6, 1.0
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v5, v8
+; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], -1.0, v1, -1.0
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v9, v6, v6
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v3, v7, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v4, v3, v7
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v2, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v3, v7
; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[4:5]
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v4, v2, v3
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
@@ -5718,42 +5523,40 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5762,31 +5565,28 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v9, v9, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v10, v10, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v9, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, -1.0, v2
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
@@ -5800,25 +5600,22 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v8, v4, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
@@ -5832,25 +5629,22 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6
-; GFX9-IEEE-NEXT: v_fma_f32 v9, -v2, v7, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v10, -v3, v8, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v7, v9, v5, v7
-; GFX9-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8
-; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_fma_f32 v6, v2, v4, -1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v7, v3, v5, -1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v6, v6, v4, -v4
+; GFX9-IEEE-NEXT: v_fma_f32 v7, v7, v5, -v5
+; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v6, -1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v7, -1.0
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
@@ -5863,25 +5657,23 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, -1.0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v7, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v6, v2
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, v1, v2, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, v0, v3, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v2, -v2
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, -v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v5, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v7, v2
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
@@ -5894,25 +5686,23 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX10-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX10-IEEE-NEXT: v_mov_b32_e32 v4, -1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, v1, v2, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, v0, v3, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_f32 v5, v5, v2, -v2
+; GFX10-IEEE-NEXT: v_fma_f32 v6, v6, v3, -v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v5, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v7, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
@@ -5925,25 +5715,22 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX10-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
@@ -5956,7 +5743,7 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX11-NEXT: v_mov_b32_e32 v4, -1.0
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
@@ -5964,22 +5751,20 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX11-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v5, v5, v2, -v2
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index 302b239..549af87 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -88,11 +88,10 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; CI-NEXT: v_or_b32_e32 v1, s4, v0
; CI-NEXT: .LBB0_8: ; %Flow19
; CI-NEXT: v_cvt_f32_f16_e32 v0, s3
-; CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; CI-NEXT: s_and_b32 s2, s2, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00
; CI-NEXT: s_cselect_b32 s2, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v2
+; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v0
; CI-NEXT: v_mov_b32_e32 v0, 0x7e00
; CI-NEXT: s_and_b32 s2, 1, s2
; CI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
@@ -1197,16 +1196,15 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_or_b32_e32 v1, s4, v1
; CI-NEXT: .LBB9_16: ; %Flow54
; CI-NEXT: v_cvt_f32_f16_e32 v2, s1
-; CI-NEXT: v_cvt_f32_f16_e32 v3, 0
; CI-NEXT: s_and_b32 s0, s0, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s0, 0x7c00
; CI-NEXT: s_cselect_b32 s4, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v2, v3
+; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v2
; CI-NEXT: v_cvt_f32_f16_e32 v2, s3
; CI-NEXT: s_and_b32 s2, s2, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00
; CI-NEXT: s_cselect_b32 s2, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], v2, v3
+; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], 0, v2
; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; CI-NEXT: v_mov_b32_e32 v2, 0x7e00
; CI-NEXT: s_and_b32 s3, 1, s4
@@ -1730,26 +1728,25 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_or_b32_e32 v3, s1, v3
; CI-NEXT: .LBB10_32: ; %Flow124
; CI-NEXT: v_cvt_f32_f16_e32 v4, s2
-; CI-NEXT: v_cvt_f32_f16_e32 v5, 0
; CI-NEXT: s_and_b32 s1, s4, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s1, 0x7c00
; CI-NEXT: s_cselect_b32 s11, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v4, v5
+; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v4
; CI-NEXT: v_cvt_f32_f16_e32 v4, s0
; CI-NEXT: s_and_b32 s2, s6, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00
; CI-NEXT: s_cselect_b32 s6, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], v4, v5
+; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], 0, v4
; CI-NEXT: v_cvt_f32_f16_e32 v4, s3
; CI-NEXT: s_and_b32 s4, s5, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s4, 0x7c00
; CI-NEXT: s_cselect_b32 s12, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e64 s[2:3], v4, v5
+; CI-NEXT: v_cmp_nlg_f32_e64 s[2:3], 0, v4
; CI-NEXT: v_cvt_f32_f16_e32 v4, s10
; CI-NEXT: s_and_b32 s7, s7, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s7, 0x7c00
; CI-NEXT: s_cselect_b32 s7, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e64 s[4:5], v4, v5
+; CI-NEXT: v_cmp_nlg_f32_e64 s[4:5], 0, v4
; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; CI-NEXT: v_mov_b32_e32 v4, 0x7e00
; CI-NEXT: s_and_b32 s10, 1, s11
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smax.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smax.mir
index eee553e..a7023d0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smax.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smax.mir
@@ -1,6 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -regbankselect-fast -o - %s | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -regbankselect-greedy -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass='amdgpu-regbankselect,amdgpu-regbanklegalize' -o - %s | FileCheck %s
---
name: smax_s32_ss
@@ -188,8 +187,7 @@ body: |
; CHECK-NEXT: [[ASHR:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST]], [[C]](s32)
; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>)
; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[BITCAST1]], 16
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
- ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST1]], [[C1]](s32)
+ ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST1]], [[C]](s32)
; CHECK-NEXT: [[SMAX:%[0-9]+]]:sgpr(s32) = G_SMAX [[SEXT_INREG]], [[SEXT_INREG1]]
; CHECK-NEXT: [[SMAX1:%[0-9]+]]:sgpr(s32) = G_SMAX [[ASHR]], [[ASHR1]]
; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SMAX]](s32), [[SMAX1]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smin.mir
index ef60aa8..9dd5f45 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smin.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-smin.mir
@@ -1,6 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -regbankselect-fast -o - %s | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -regbankselect-greedy -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass='amdgpu-regbankselect,amdgpu-regbanklegalize' -o - %s | FileCheck %s
---
name: smin_s32_ss
@@ -191,8 +190,7 @@ body: |
; CHECK-NEXT: [[ASHR:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST]], [[C]](s32)
; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>)
; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:sgpr(s32) = G_SEXT_INREG [[BITCAST1]], 16
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
- ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST1]], [[C1]](s32)
+ ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:sgpr(s32) = G_ASHR [[BITCAST1]], [[C]](s32)
; CHECK-NEXT: [[SMIN:%[0-9]+]]:sgpr(s32) = G_SMIN [[SEXT_INREG]], [[SEXT_INREG1]]
; CHECK-NEXT: [[SMIN1:%[0-9]+]]:sgpr(s32) = G_SMIN [[ASHR]], [[ASHR1]]
; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[SMIN]](s32), [[SMIN1]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umax.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umax.mir
index 36a38aa..59d7dce 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umax.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umax.mir
@@ -1,6 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -regbankselect-fast -o - %s | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -regbankselect-greedy -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass='amdgpu-regbankselect,amdgpu-regbanklegalize' -o - %s | FileCheck %s
---
name: umax_s32_ss
@@ -186,15 +185,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr1
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
- ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST]], [[C1]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST]], [[C]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
- ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C2]](s32)
- ; CHECK-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
- ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST1]], [[C3]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST1]], [[C]]
+ ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
; CHECK-NEXT: [[UMAX:%[0-9]+]]:sgpr(s32) = G_UMAX [[AND]], [[AND1]]
; CHECK-NEXT: [[UMAX1:%[0-9]+]]:sgpr(s32) = G_UMAX [[LSHR]], [[LSHR1]]
; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[UMAX]](s32), [[UMAX1]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umin.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umin.mir
index bb232b5e..fdb05f6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umin.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-umin.mir
@@ -1,6 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -regbankselect-fast -o - %s | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=regbankselect -verify-machineinstrs -regbankselect-greedy -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass='amdgpu-regbankselect,amdgpu-regbanklegalize' -o - %s | FileCheck %s
---
name: umin_s32_ss
@@ -190,15 +189,13 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr1
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
- ; CHECK-NEXT: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
- ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST]], [[C1]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST]], [[C]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:sgpr(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 16
- ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C2]](s32)
- ; CHECK-NEXT: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 65535
- ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST1]], [[C3]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:sgpr(s32) = G_AND [[BITCAST1]], [[C]]
+ ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:sgpr(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
; CHECK-NEXT: [[UMIN:%[0-9]+]]:sgpr(s32) = G_UMIN [[AND]], [[AND1]]
; CHECK-NEXT: [[UMIN1:%[0-9]+]]:sgpr(s32) = G_UMIN [[LSHR]], [[LSHR1]]
; CHECK-NEXT: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[UMIN]](s32), [[UMIN1]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 9ffc565..4f2c454 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -2537,202 +2537,195 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4
; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1
-; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0
-; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v1
-; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v5
+; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0
+; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v1
+; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9
; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc
; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3
-; GISEL-NEXT: v_trunc_f32_e32 v7, v4
-; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v7
-; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v3
-; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v7
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v9, 0
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v4, v12, v3
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; GISEL-NEXT: v_mul_hi_u32 v8, v9, v3
-; GISEL-NEXT: v_mul_hi_u32 v3, v12, v3
-; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7
-; GISEL-NEXT: v_mul_lo_u32 v14, v12, v7
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13
+; GISEL-NEXT: v_trunc_f32_e32 v5, v4
+; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v3
+; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v12, v7, v3
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3
+; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3
+; GISEL-NEXT: v_mul_lo_u32 v13, v7, v4
+; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v7, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8
-; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v3
-; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v9, 0
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v4, v12, v3
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v3
+; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3
; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v3
-; GISEL-NEXT: v_mul_hi_u32 v3, v12, v3
-; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7
-; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v2
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT: v_mul_lo_u32 v8, v7, v4
+; GISEL-NEXT: v_mul_hi_u32 v0, v7, v3
+; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v4, v12, v7
+; GISEL-NEXT: v_mul_lo_u32 v5, v11, v4
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v8, v7, v4
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v12, v3, vcc
-; GISEL-NEXT: v_mul_lo_u32 v7, 0, v0
-; GISEL-NEXT: v_mul_lo_u32 v8, v10, v4
-; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6
-; GISEL-NEXT: v_mul_hi_u32 v6, v10, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc
+; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0
+; GISEL-NEXT: v_mul_lo_u32 v5, v10, v3
+; GISEL-NEXT: v_mul_hi_u32 v7, v10, v0
; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
-; GISEL-NEXT: v_mul_lo_u32 v8, 0, v4
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v2
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT: v_mul_lo_u32 v5, 0, v3
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v0, 0
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v3
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v4, v[7:8]
-; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v5
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v9
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v0, v[7:8]
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v10, v6
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0
+; GISEL-NEXT: v_mul_hi_u32 v4, 0, v3
+; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6
+; GISEL-NEXT: v_mov_b32_e32 v5, v8
+; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v4, v[5:6]
+; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v8
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v0, v[5:6]
+; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v10, v7
; GISEL-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2
-; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], 0, v7, vcc
-; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 0, v7
-; GISEL-NEXT: v_trunc_f32_e32 v7, v5
-; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v7
+; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v2
+; GISEL-NEXT: v_trunc_f32_e32 v8, v6
+; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v8
; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v2
+; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, v5, vcc
; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v3
; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0
-; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v7
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1
-; GISEL-NEXT: v_mov_b32_e32 v2, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[2:3]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, -1, v16, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[6:7]
-; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v10, vcc
-; GISEL-NEXT: v_mul_lo_u32 v7, v15, v5
-; GISEL-NEXT: v_mul_lo_u32 v10, v12, v6
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v8, v1
-; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v2, vcc
-; GISEL-NEXT: v_mul_hi_u32 v2, v12, v5
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0
+; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v8
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5
+; GISEL-NEXT: v_mov_b32_e32 v2, v7
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v15, v[2:3]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v14, v12, v[7:8]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v8, -1, v2, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v2, v15, v6
+; GISEL-NEXT: v_mul_lo_u32 v10, v12, v7
+; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v5, vcc
+; GISEL-NEXT: v_mul_hi_u32 v5, v12, v6
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v7, v15, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v15, v5
+; GISEL-NEXT: v_mul_lo_u32 v5, v15, v7
+; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2
-; GISEL-NEXT: v_mul_hi_u32 v10, v12, v6
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v10, v12, v7
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; GISEL-NEXT: v_mul_hi_u32 v7, v15, v7
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v12, v2
-; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v15, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v0
-; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v4, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v1
-; GISEL-NEXT: v_mov_b32_e32 v1, v6
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v13, v10, v[1:2]
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v14, v7, v[1:2]
-; GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v8, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v12
-; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v15, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v10, v5
-; GISEL-NEXT: v_mul_lo_u32 v14, v7, v1
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
-; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v13, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v12, v10, v1
-; GISEL-NEXT: v_mul_hi_u32 v5, v10, v5
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v13, v6
-; GISEL-NEXT: v_mul_hi_u32 v13, v7, v1
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v12, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13
-; GISEL-NEXT: v_mul_hi_u32 v1, v10, v1
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6
-; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v6
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5
-; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], v10, v1, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v2
+; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v15, v5, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v10, 0
+; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v9, v1
+; GISEL-NEXT: v_mov_b32_e32 v2, v6
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, v[2:3]
+; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v16, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v10, v[6:7]
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v0
+; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v4, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1
+; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5
+; GISEL-NEXT: v_mul_lo_u32 v9, v10, v6
+; GISEL-NEXT: v_mul_hi_u32 v14, v10, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15
+; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v14, v12, v6
+; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
+; GISEL-NEXT: v_mul_hi_u32 v9, v10, v6
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v14, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9
+; GISEL-NEXT: v_mul_hi_u32 v6, v12, v6
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v12, v6, vcc
; GISEL-NEXT: v_mul_lo_u32 v6, 0, v5
-; GISEL-NEXT: v_mul_lo_u32 v7, v11, v1
-; GISEL-NEXT: v_mul_hi_u32 v10, v11, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v8, vcc
+; GISEL-NEXT: v_mul_lo_u32 v9, v11, v7
+; GISEL-NEXT: v_mul_hi_u32 v14, v11, v5
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v2
+; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v13, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT: v_mul_lo_u32 v9, 0, v7
; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; GISEL-NEXT: v_mul_lo_u32 v7, 0, v1
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT: v_mul_hi_u32 v10, v11, v1
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14
+; GISEL-NEXT: v_mul_hi_u32 v14, v11, v7
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v5, v6
-; GISEL-NEXT: v_mul_hi_u32 v1, 0, v1
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v10, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v1, v7
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v5, v6
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
+; GISEL-NEXT: v_mul_hi_u32 v10, 0, v7
; GISEL-NEXT: v_mov_b32_e32 v1, v6
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[1:2]
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v10, v[1:2]
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v10, v[6:7]
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v12, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v9, v[6:7]
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v5
; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v6
; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v6, vcc
@@ -2743,8 +2736,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v10
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v12, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v9
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
@@ -2755,8 +2748,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_sdiv_v2i64_24bit:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll
index ac1e11b..dfa613c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx802 < %s | FileCheck -check-prefixes=GFX89,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX89,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-mesa3d -mcpu=gfx802 < %s | FileCheck -check-prefixes=GFX89,GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX89,GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
define i32 @test_min_max_ValK0_K1_i32(i32 %a) {
; GFX89-LABEL: test_min_max_ValK0_K1_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index 82279e6..40b5db0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -3035,203 +3035,193 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4
; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1
-; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v4, 0
-; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1
-; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4
+; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v7, 0
+; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v1
+; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v7
; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc
; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
-; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v3
-; GISEL-NEXT: v_trunc_f32_e32 v5, v5
+; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3
+; GISEL-NEXT: v_trunc_f32_e32 v5, v4
; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5
-; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v3
-; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
-; GISEL-NEXT: v_mov_b32_e32 v3, v8
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v5, v[3:4]
-; GISEL-NEXT: v_mul_lo_u32 v3, v5, v7
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
-; GISEL-NEXT: v_mul_hi_u32 v9, v10, v7
-; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7
-; GISEL-NEXT: v_mul_lo_u32 v13, v10, v8
-; GISEL-NEXT: v_mul_lo_u32 v14, v5, v8
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v13
+; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v3
+; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v12, v8, v3
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3
+; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3
+; GISEL-NEXT: v_mul_lo_u32 v13, v8, v4
+; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
-; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v13, v3
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
-; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v3
-; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
-; GISEL-NEXT: v_mov_b32_e32 v3, v8
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v5, v[3:4]
-; GISEL-NEXT: v_mul_lo_u32 v3, v5, v7
-; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v0
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
-; GISEL-NEXT: v_mul_hi_u32 v0, v10, v7
-; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7
-; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v3
+; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v9, v8, v3
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5]
+; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0
+; GISEL-NEXT: v_mul_lo_u32 v0, v11, v3
+; GISEL-NEXT: v_mul_lo_u32 v5, v8, v4
+; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v5, v8
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
+; GISEL-NEXT: v_mul_lo_u32 v9, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
+; GISEL-NEXT: v_mul_hi_u32 v5, v8, v4
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
+; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v3, vcc
-; GISEL-NEXT: v_mul_lo_u32 v7, 0, v0
-; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v3, vcc
+; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0
+; GISEL-NEXT: v_mul_lo_u32 v5, v10, v8
; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6
-; GISEL-NEXT: v_mul_hi_u32 v6, v11, v0
+; GISEL-NEXT: v_mul_hi_u32 v6, v10, v0
; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
-; GISEL-NEXT: v_mul_lo_u32 v8, 0, v5
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT: v_mul_hi_u32 v7, v11, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v0, v6
-; GISEL-NEXT: v_mul_hi_u32 v9, 0, v5
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v8, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v0
-; GISEL-NEXT: v_mov_b32_e32 v0, v6
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v7, v[0:1]
-; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v8, v[6:7]
-; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v4
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v11, v5
-; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], 0, v6, vcc
-; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0
-; GISEL-NEXT: v_trunc_f32_e32 v9, v4
-; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v9
-; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v0
-; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], 0, v3
-; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, 0
-; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9
-; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v6
-; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[0:1]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v0, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v0, v9, v4
-; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5
-; GISEL-NEXT: v_mul_hi_u32 v15, v10, v4
-; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v13, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v15, v9, v5
-; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
-; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT: v_mul_lo_u32 v5, 0, v8
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; GISEL-NEXT: v_mul_hi_u32 v6, v10, v8
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v0
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, 0
-; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v7, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v4
+; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v3
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v9, 0
+; GISEL-NEXT: v_mul_hi_u32 v6, 0, v8
+; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v7
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v0
; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[0:1]
-; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v13, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6]
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v15, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13
-; GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v16, vcc
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v1
-; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v13, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v9, v4
-; GISEL-NEXT: v_mul_lo_u32 v12, v10, v5
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v10, v4
-; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4
-; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v11, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v11, v9, v5
-; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v12, v0
-; GISEL-NEXT: v_mul_hi_u32 v12, v10, v5
-; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v11, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12
-; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5
-; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v4, v0
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v6, v[0:1]
+; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v7
+; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v0
+; GISEL-NEXT: v_trunc_f32_e32 v11, v7
+; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v11
+; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v0
+; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v3
+; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v12, 0
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6]
+; GISEL-NEXT: v_mov_b32_e32 v0, v8
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1]
+; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v12, v[8:9]
+; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v10, v4
+; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, v5, vcc
+; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], 0, v5
+; GISEL-NEXT: v_mul_lo_u32 v4, v11, v7
+; GISEL-NEXT: v_mul_lo_u32 v5, v12, v8
+; GISEL-NEXT: v_mul_hi_u32 v9, v12, v7
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v9
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v11, v4
+; GISEL-NEXT: v_mul_lo_u32 v9, v11, v8
+; GISEL-NEXT: v_mul_hi_u32 v7, v11, v7
; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4
-; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v10, v0
-; GISEL-NEXT: v_addc_u32_e64 v4, s[4:5], v9, v4, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v5, 0, v0
-; GISEL-NEXT: v_mul_lo_u32 v9, v2, v4
-; GISEL-NEXT: v_cndmask_b32_e32 v10, v13, v6, vcc
-; GISEL-NEXT: v_mul_hi_u32 v6, v2, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
-; GISEL-NEXT: v_mul_lo_u32 v9, 0, v4
-; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT: v_mul_hi_u32 v6, v2, v4
+; GISEL-NEXT: v_mul_hi_u32 v5, v12, v8
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7
+; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5
+; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v4
+; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v11, v5, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v7, 0
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
+; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v0, vcc
+; GISEL-NEXT: v_mov_b32_e32 v0, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v9, -1, v6, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v8, v[0:1]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v1
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v7, v[5:6]
+; GISEL-NEXT: v_mul_lo_u32 v12, v8, v4
+; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v11, vcc
+; GISEL-NEXT: v_mul_lo_u32 v13, v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v14, v7, v4
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11
+; GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v14, v8, v5
+; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_mul_hi_u32 v13, v7, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v8, v5, vcc
+; GISEL-NEXT: v_mul_lo_u32 v5, 0, v4
+; GISEL-NEXT: v_mul_lo_u32 v8, v2, v7
+; GISEL-NEXT: v_mul_hi_u32 v13, v2, v4
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v0, v1
+; GISEL-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v11, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT: v_mul_lo_u32 v8, 0, v7
+; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13
+; GISEL-NEXT: v_mul_hi_u32 v13, v2, v7
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v5
-; GISEL-NEXT: v_mul_hi_u32 v11, 0, v4
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v0
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v4, v5
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; GISEL-NEXT: v_mul_hi_u32 v6, 0, v7
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GISEL-NEXT: v_mov_b32_e32 v0, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc
; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v6, v[0:1]
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6]
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v10, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v8, v[5:6]
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v7, vcc
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v5, vcc
; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index 4de1078..ded985e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -2053,90 +2053,82 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_mul_lo_u32 v12, v6, v2
; GISEL-NEXT: v_mul_lo_u32 v13, 0, v2
; GISEL-NEXT: v_mul_hi_u32 v14, v6, v2
-; GISEL-NEXT: v_mul_hi_u32 v2, 0, v2
-; GISEL-NEXT: v_mul_lo_u32 v15, v0, v5
+; GISEL-NEXT: v_mul_hi_u32 v15, 0, v2
+; GISEL-NEXT: v_mul_lo_u32 v2, v0, v5
; GISEL-NEXT: v_mul_lo_u32 v16, 0, v5
; GISEL-NEXT: v_mul_hi_u32 v17, v0, v5
; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15
+; GISEL-NEXT: v_mul_lo_u32 v12, v3, v15
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7
+; GISEL-NEXT: v_mul_lo_u32 v10, v1, v5
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_mul_lo_u32 v9, v3, v4
-; GISEL-NEXT: v_mul_lo_u32 v12, 0, v4
-; GISEL-NEXT: v_mul_hi_u32 v13, v3, v4
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7
-; GISEL-NEXT: v_mul_lo_u32 v14, 0, v7
-; GISEL-NEXT: v_mul_hi_u32 v15, v1, v7
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v2, v8
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; GISEL-NEXT: v_mul_lo_u32 v2, v3, v8
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v4
-; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v8, vcc
-; GISEL-NEXT: v_mul_lo_u32 v17, v1, v5
-; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v7
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; GISEL-NEXT: v_mul_lo_u32 v7, v3, v4
+; GISEL-NEXT: v_mul_lo_u32 v8, 0, v4
+; GISEL-NEXT: v_mul_hi_u32 v9, v3, v4
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v4
+; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v15, vcc
+; GISEL-NEXT: v_mul_lo_u32 v14, v1, v2
+; GISEL-NEXT: v_mul_lo_u32 v16, 0, v2
+; GISEL-NEXT: v_mul_hi_u32 v17, v1, v2
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v2
; GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v5, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v17
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v10
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15
-; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v9
-; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], 0, v2, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v16, v10
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v11
+; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v13, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v17
+; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], 0, v8, vcc
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v11
-; GISEL-NEXT: v_subb_u32_e64 v11, s[6:7], 0, v12, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v14
+; GISEL-NEXT: v_subb_u32_e64 v14, s[6:7], 0, v9, s[4:5]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[6:7]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v9
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v11
-; GISEL-NEXT: v_add_i32_e64 v9, s[10:11], 1, v18
-; GISEL-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v19, s[10:11]
-; GISEL-NEXT: v_sub_i32_e64 v2, s[10:11], 0, v2
-; GISEL-NEXT: v_sub_i32_e64 v12, s[10:11], 0, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v13, s[6:7]
-; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v15, s[8:9]
-; GISEL-NEXT: v_subbrev_u32_e64 v12, vcc, 0, v12, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[6:7]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v14
+; GISEL-NEXT: v_add_i32_e64 v7, s[10:11], 1, v18
+; GISEL-NEXT: v_addc_u32_e64 v14, s[10:11], 0, v19, s[10:11]
+; GISEL-NEXT: v_sub_i32_e64 v8, s[10:11], 0, v8
+; GISEL-NEXT: v_sub_i32_e64 v9, s[10:11], 0, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v10, s[6:7]
+; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v17, -1, v17, s[8:9]
+; GISEL-NEXT: v_subbrev_u32_e64 v9, vcc, 0, v9, s[4:5]
; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v3
-; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
-; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v12, vcc
+; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v9, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v3, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6
; GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v14, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc
; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v2, v18, v9, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v3, v18, v7, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e32 v6, v13, v16, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v4, v19, v11, s[4:5]
-; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v4, v19, v14, s[4:5]
+; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v17
+; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v6, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[4:5]
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll
index 2b54123..f5068f5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx802 < %s | FileCheck -check-prefixes=GFX89,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX89,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-mesa3d -mcpu=gfx802 < %s | FileCheck -check-prefixes=GFX89,GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX89,GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
define i32 @test_min_max_ValK0_K1_u32(i32 %a) {
; GFX89-LABEL: test_min_max_ValK0_K1_u32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index a41ec8e..be5543b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -2058,42 +2058,34 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8
+; GISEL-NEXT: v_mul_lo_u32 v2, v3, v2
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7
+; GISEL-NEXT: v_mul_lo_u32 v5, v1, v5
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v14
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v11
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_mul_lo_u32 v9, v3, v4
-; GISEL-NEXT: v_mul_lo_u32 v12, 0, v4
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT: v_mul_lo_u32 v8, v3, v4
+; GISEL-NEXT: v_mul_lo_u32 v9, 0, v4
; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7
-; GISEL-NEXT: v_mul_lo_u32 v13, 0, v7
+; GISEL-NEXT: v_mul_lo_u32 v10, v1, v7
+; GISEL-NEXT: v_mul_lo_u32 v11, 0, v7
; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; GISEL-NEXT: v_mul_lo_u32 v2, v3, v2
-; GISEL-NEXT: v_mul_lo_u32 v5, v1, v5
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v12, v2
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v7
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v6, v9
+; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v6, v8
; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], 0, v2, vcc
; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], 0, v2
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v11
+; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v10
; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], 0, v4, s[4:5]
; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], 0, v4
; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis-heuristics.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis-heuristics.ll
index c94b333..1f36902 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis-heuristics.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis-heuristics.ll
@@ -726,16 +726,16 @@ define amdgpu_kernel void @used_by_unbreakable_and_breakable_phi(<5 x double> %i
; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE815:%.*]] = extractelement <5 x double> [[LARGEPHI_INSERTSLICE4]], i64 4
; CHECK-NEXT: br label [[END]]
; CHECK: end:
-; CHECK-NEXT: [[TMP5:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE01]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[FINALLY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE22]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[FINALLY]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE43]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[FINALLY]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE64]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[FINALLY]] ]
-; CHECK-NEXT: [[TMP9:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE85]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[FINALLY]] ]
-; CHECK-NEXT: [[TMP10:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE011]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE212]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
-; CHECK-NEXT: [[TMP12:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE413]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
-; CHECK-NEXT: [[TMP13:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE614]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
-; CHECK-NEXT: [[TMP14:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE815]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE01]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE22]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE43]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE64]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE85]], [[THEN1]] ], [ 0.000000e+00, [[FINALLY]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE011]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[FINALLY]] ]
+; CHECK-NEXT: [[TMP11:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE212]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[FINALLY]] ]
+; CHECK-NEXT: [[TMP12:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE413]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[FINALLY]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE614]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[FINALLY]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE815]], [[THEN1]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[FINALLY]] ]
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE016:%.*]] = insertelement <5 x double> poison, double [[TMP10]], i64 0
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE117:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE016]], double [[TMP11]], i64 1
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE218:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE117]], double [[TMP12]], i64 2
@@ -746,8 +746,8 @@ define amdgpu_kernel void @used_by_unbreakable_and_breakable_phi(<5 x double> %i
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE28:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE17]], double [[TMP7]], i64 2
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE39:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE28]], double [[TMP8]], i64 3
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE410:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE39]], double [[TMP9]], i64 4
-; CHECK-NEXT: store <5 x double> [[LARGEPHI_INSERTSLICE410]], ptr [[OUT]], align 1
; CHECK-NEXT: store <5 x double> [[LARGEPHI_INSERTSLICE420]], ptr [[OUT]], align 1
+; CHECK-NEXT: store <5 x double> [[LARGEPHI_INSERTSLICE410]], ptr [[OUT]], align 1
; CHECK-NEXT: ret void
;
entry:
@@ -1187,11 +1187,11 @@ define amdgpu_kernel void @test_breakable_chain_5_out_of_7(<5 x double> %in, ptr
; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE960:%.*]] = extractelement <5 x double> [[IN]], i64 4
; CHECK-NEXT: br i1 [[COND]], label [[END:%.*]], label [[COND5_END]]
; CHECK: cond5.end:
-; CHECK-NEXT: [[TMP25:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE041]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[COND5_TRUE]] ]
-; CHECK-NEXT: [[TMP26:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE242]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[COND5_TRUE]] ]
-; CHECK-NEXT: [[TMP27:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE443]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[COND5_TRUE]] ]
-; CHECK-NEXT: [[TMP28:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE644]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[COND5_TRUE]] ]
-; CHECK-NEXT: [[TMP29:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE845]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[COND5_TRUE]] ]
+; CHECK-NEXT: [[TMP25:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE041]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE152]], [[COND5_TRUE]] ]
+; CHECK-NEXT: [[TMP26:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE242]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE354]], [[COND5_TRUE]] ]
+; CHECK-NEXT: [[TMP27:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE443]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE556]], [[COND5_TRUE]] ]
+; CHECK-NEXT: [[TMP28:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE644]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE758]], [[COND5_TRUE]] ]
+; CHECK-NEXT: [[TMP29:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE845]], [[COND4_END]] ], [ [[LARGEPHI_EXTRACTSLICE960]], [[COND5_TRUE]] ]
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE046:%.*]] = insertelement <5 x double> poison, double [[TMP25]], i64 0
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE147:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE046]], double [[TMP26]], i64 1
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE248:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE147]], double [[TMP27]], i64 2
@@ -1204,11 +1204,11 @@ define amdgpu_kernel void @test_breakable_chain_5_out_of_7(<5 x double> %in, ptr
; CHECK-NEXT: [[LARGEPHI_EXTRACTSLICE859:%.*]] = extractelement <5 x double> [[LARGEPHI_INSERTSLICE450]], i64 4
; CHECK-NEXT: br label [[END]]
; CHECK: end:
-; CHECK-NEXT: [[TMP30:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE051]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE152]], [[COND5_TRUE]] ]
-; CHECK-NEXT: [[TMP31:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE253]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE354]], [[COND5_TRUE]] ]
-; CHECK-NEXT: [[TMP32:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE455]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE556]], [[COND5_TRUE]] ]
-; CHECK-NEXT: [[TMP33:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE657]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE758]], [[COND5_TRUE]] ]
-; CHECK-NEXT: [[TMP34:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE859]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE960]], [[COND5_TRUE]] ]
+; CHECK-NEXT: [[TMP30:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE051]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE1]], [[COND5_TRUE]] ]
+; CHECK-NEXT: [[TMP31:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE253]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE3]], [[COND5_TRUE]] ]
+; CHECK-NEXT: [[TMP32:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE455]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE5]], [[COND5_TRUE]] ]
+; CHECK-NEXT: [[TMP33:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE657]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE7]], [[COND5_TRUE]] ]
+; CHECK-NEXT: [[TMP34:%.*]] = phi double [ [[LARGEPHI_EXTRACTSLICE859]], [[COND5_END]] ], [ [[LARGEPHI_EXTRACTSLICE9]], [[COND5_TRUE]] ]
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE061:%.*]] = insertelement <5 x double> poison, double [[TMP30]], i64 0
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE162:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE061]], double [[TMP31]], i64 1
; CHECK-NEXT: [[LARGEPHI_INSERTSLICE263:%.*]] = insertelement <5 x double> [[LARGEPHI_INSERTSLICE162]], double [[TMP32]], i64 2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
index a4f9ce3..7ff86ac 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
@@ -2160,7 +2160,22 @@ define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x flo
; IEEE-GOODFREXP-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP27]], i64 0
; IEEE-GOODFREXP-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP37]], i64 1
; IEEE-GOODFREXP-NEXT: store volatile <2 x float> [[MD_1ULP_UNDEF]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-GOODFREXP-NEXT: [[SQRT_X_3ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META3:![0-9]+]]
+; IEEE-GOODFREXP-NEXT: [[TMP56:%.*]] = extractelement <2 x float> [[X]], i64 0
+; IEEE-GOODFREXP-NEXT: [[TMP57:%.*]] = extractelement <2 x float> [[X]], i64 1
+; IEEE-GOODFREXP-NEXT: [[TMP58:%.*]] = fcmp olt float [[TMP56]], 0x3810000000000000
+; IEEE-GOODFREXP-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], i32 32, i32 0
+; IEEE-GOODFREXP-NEXT: [[TMP60:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP56]], i32 [[TMP59]])
+; IEEE-GOODFREXP-NEXT: [[TMP61:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP60]])
+; IEEE-GOODFREXP-NEXT: [[TMP62:%.*]] = select i1 [[TMP58]], i32 -16, i32 0
+; IEEE-GOODFREXP-NEXT: [[TMP63:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP61]], i32 [[TMP62]])
+; IEEE-GOODFREXP-NEXT: [[TMP64:%.*]] = fcmp olt float [[TMP57]], 0x3810000000000000
+; IEEE-GOODFREXP-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], i32 32, i32 0
+; IEEE-GOODFREXP-NEXT: [[TMP66:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP57]], i32 [[TMP65]])
+; IEEE-GOODFREXP-NEXT: [[TMP67:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP66]])
+; IEEE-GOODFREXP-NEXT: [[TMP68:%.*]] = select i1 [[TMP64]], i32 -16, i32 0
+; IEEE-GOODFREXP-NEXT: [[TMP69:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP67]], i32 [[TMP68]])
+; IEEE-GOODFREXP-NEXT: [[TMP70:%.*]] = insertelement <2 x float> poison, float [[TMP63]], i64 0
+; IEEE-GOODFREXP-NEXT: [[SQRT_X_3ULP:%.*]] = insertelement <2 x float> [[TMP70]], float [[TMP69]], i64 1
; IEEE-GOODFREXP-NEXT: [[TMP39:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 0
; IEEE-GOODFREXP-NEXT: [[TMP40:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 1
; IEEE-GOODFREXP-NEXT: [[TMP41:%.*]] = extractelement <2 x float> [[X]], i64 0
@@ -2231,7 +2246,22 @@ define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x flo
; IEEE-BADFREXP-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP27]], i64 0
; IEEE-BADFREXP-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP37]], i64 1
; IEEE-BADFREXP-NEXT: store volatile <2 x float> [[MD_1ULP_UNDEF]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-BADFREXP-NEXT: [[SQRT_X_3ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META3:![0-9]+]]
+; IEEE-BADFREXP-NEXT: [[TMP56:%.*]] = extractelement <2 x float> [[X]], i64 0
+; IEEE-BADFREXP-NEXT: [[TMP57:%.*]] = extractelement <2 x float> [[X]], i64 1
+; IEEE-BADFREXP-NEXT: [[TMP58:%.*]] = fcmp olt float [[TMP56]], 0x3810000000000000
+; IEEE-BADFREXP-NEXT: [[TMP59:%.*]] = select i1 [[TMP58]], i32 32, i32 0
+; IEEE-BADFREXP-NEXT: [[TMP60:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP56]], i32 [[TMP59]])
+; IEEE-BADFREXP-NEXT: [[TMP61:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP60]])
+; IEEE-BADFREXP-NEXT: [[TMP62:%.*]] = select i1 [[TMP58]], i32 -16, i32 0
+; IEEE-BADFREXP-NEXT: [[TMP63:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP61]], i32 [[TMP62]])
+; IEEE-BADFREXP-NEXT: [[TMP64:%.*]] = fcmp olt float [[TMP57]], 0x3810000000000000
+; IEEE-BADFREXP-NEXT: [[TMP65:%.*]] = select i1 [[TMP64]], i32 32, i32 0
+; IEEE-BADFREXP-NEXT: [[TMP66:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP57]], i32 [[TMP65]])
+; IEEE-BADFREXP-NEXT: [[TMP67:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP66]])
+; IEEE-BADFREXP-NEXT: [[TMP68:%.*]] = select i1 [[TMP64]], i32 -16, i32 0
+; IEEE-BADFREXP-NEXT: [[TMP69:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP67]], i32 [[TMP68]])
+; IEEE-BADFREXP-NEXT: [[TMP70:%.*]] = insertelement <2 x float> poison, float [[TMP63]], i64 0
+; IEEE-BADFREXP-NEXT: [[SQRT_X_3ULP:%.*]] = insertelement <2 x float> [[TMP70]], float [[TMP69]], i64 1
; IEEE-BADFREXP-NEXT: [[TMP39:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 0
; IEEE-BADFREXP-NEXT: [[TMP40:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 1
; IEEE-BADFREXP-NEXT: [[TMP41:%.*]] = extractelement <2 x float> [[X]], i64 0
@@ -2258,7 +2288,12 @@ define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x flo
; DAZ-NEXT: [[SQRT_X_NO_MD:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]])
; DAZ-NEXT: [[NO_MD:%.*]] = fdiv contract <2 x float> splat (float 1.000000e+00), [[SQRT_X_NO_MD]]
; DAZ-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT: [[SQRT_MD_1ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META2:![0-9]+]]
+; DAZ-NEXT: [[TMP39:%.*]] = extractelement <2 x float> [[X]], i64 0
+; DAZ-NEXT: [[TMP40:%.*]] = extractelement <2 x float> [[X]], i64 1
+; DAZ-NEXT: [[TMP41:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP39]])
+; DAZ-NEXT: [[TMP42:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP40]])
+; DAZ-NEXT: [[TMP43:%.*]] = insertelement <2 x float> poison, float [[TMP41]], i64 0
+; DAZ-NEXT: [[SQRT_MD_1ULP:%.*]] = insertelement <2 x float> [[TMP43]], float [[TMP42]], i64 1
; DAZ-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP]], i64 0
; DAZ-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP]], i64 1
; DAZ-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[X]], i64 0
@@ -2276,7 +2311,9 @@ define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x flo
; DAZ-NEXT: [[SQRT_MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP11]], i64 1
; DAZ-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 0
; DAZ-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[SQRT_MD_1ULP_UNDEF]], i64 1
-; DAZ-NEXT: [[TMP15:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]])
+; DAZ-NEXT: [[TMP44:%.*]] = extractelement <2 x float> [[X]], i64 0
+; DAZ-NEXT: [[TMP45:%.*]] = extractelement <2 x float> [[X]], i64 1
+; DAZ-NEXT: [[TMP15:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP44]])
; DAZ-NEXT: [[TMP16:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP14]])
; DAZ-NEXT: [[TMP17:%.*]] = extractvalue { float, i32 } [[TMP16]], 0
; DAZ-NEXT: [[TMP18:%.*]] = extractvalue { float, i32 } [[TMP16]], 1
@@ -2290,7 +2327,12 @@ define amdgpu_kernel void @rsq_f32_vector_fpmath(ptr addrspace(1) %out, <2 x flo
; DAZ-NEXT: [[TMP26:%.*]] = insertelement <2 x float> poison, float [[TMP15]], i64 0
; DAZ-NEXT: [[MD_1ULP_UNDEF:%.*]] = insertelement <2 x float> [[TMP26]], float [[TMP25]], i64 1
; DAZ-NEXT: store volatile <2 x float> [[MD_1ULP_UNDEF]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT: [[SQRT_X_3ULP:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath [[META3:![0-9]+]]
+; DAZ-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[X]], i64 0
+; DAZ-NEXT: [[TMP35:%.*]] = extractelement <2 x float> [[X]], i64 1
+; DAZ-NEXT: [[TMP36:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP34]])
+; DAZ-NEXT: [[TMP37:%.*]] = call float @llvm.amdgcn.sqrt.f32(float [[TMP35]])
+; DAZ-NEXT: [[TMP38:%.*]] = insertelement <2 x float> poison, float [[TMP36]], i64 0
+; DAZ-NEXT: [[SQRT_X_3ULP:%.*]] = insertelement <2 x float> [[TMP38]], float [[TMP37]], i64 1
; DAZ-NEXT: [[TMP27:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 0
; DAZ-NEXT: [[TMP28:%.*]] = extractelement <2 x float> [[SQRT_X_3ULP]], i64 1
; DAZ-NEXT: [[TMP29:%.*]] = extractelement <2 x float> [[X]], i64 0
@@ -3200,9 +3242,13 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator(<4 x float> %arg) {
; DAZ-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[DENOM]], i64 1
; DAZ-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[DENOM]], i64 2
; DAZ-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[DENOM]], i64 3
-; DAZ-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP12]])
-; DAZ-NEXT: [[TMP17:%.*]] = fneg contract float [[TMP13]]
-; DAZ-NEXT: [[TMP18:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP17]])
+; DAZ-NEXT: [[TMP42:%.*]] = extractelement <4 x float> [[ARG]], i64 0
+; DAZ-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[ARG]], i64 1
+; DAZ-NEXT: [[TMP43:%.*]] = extractelement <4 x float> [[ARG]], i64 2
+; DAZ-NEXT: [[TMP44:%.*]] = extractelement <4 x float> [[ARG]], i64 3
+; DAZ-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP42]])
+; DAZ-NEXT: [[TMP45:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP17]])
+; DAZ-NEXT: [[TMP18:%.*]] = fneg contract float [[TMP45]]
; DAZ-NEXT: [[TMP19:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP14]])
; DAZ-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP19]], 0
; DAZ-NEXT: [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP19]], 1
@@ -3675,9 +3721,13 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_arcp(<4 x float> %ar
; DAZ-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[DENOM]], i64 1
; DAZ-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[DENOM]], i64 2
; DAZ-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[DENOM]], i64 3
-; DAZ-NEXT: [[TMP16:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP12]])
-; DAZ-NEXT: [[TMP17:%.*]] = fneg arcp contract float [[TMP13]]
-; DAZ-NEXT: [[TMP18:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP17]])
+; DAZ-NEXT: [[TMP26:%.*]] = extractelement <4 x float> [[ARG]], i64 0
+; DAZ-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[ARG]], i64 1
+; DAZ-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[ARG]], i64 2
+; DAZ-NEXT: [[TMP28:%.*]] = extractelement <4 x float> [[ARG]], i64 3
+; DAZ-NEXT: [[TMP16:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP26]])
+; DAZ-NEXT: [[TMP29:%.*]] = call arcp contract float @llvm.amdgcn.rsq.f32(float [[TMP17]])
+; DAZ-NEXT: [[TMP18:%.*]] = fneg arcp contract float [[TMP29]]
; DAZ-NEXT: [[TMP19:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP14]])
; DAZ-NEXT: [[TMP20:%.*]] = fmul arcp contract float 4.000000e+00, [[TMP19]]
; DAZ-NEXT: [[TMP21:%.*]] = call arcp contract float @llvm.amdgcn.rcp.f32(float [[TMP15]])
@@ -3850,19 +3900,9 @@ define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float
; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SQRT]], i64 1
; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[SQRT]], i64 2
; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[SQRT]], i64 3
-; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP8]])
-; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0
-; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP12]], 1
-; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]]
-; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]])
-; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]])
-; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = fneg contract float [[TMP9]]
-; IEEE-GOODFREXP-NEXT: [[TMP48:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP18]])
-; IEEE-GOODFREXP-NEXT: [[TMP49:%.*]] = extractvalue { float, i32 } [[TMP48]], 0
-; IEEE-GOODFREXP-NEXT: [[TMP50:%.*]] = extractvalue { float, i32 } [[TMP48]], 1
-; IEEE-GOODFREXP-NEXT: [[TMP22:%.*]] = sub i32 0, [[TMP50]]
-; IEEE-GOODFREXP-NEXT: [[TMP51:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP49]])
-; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP51]], i32 [[TMP22]])
+; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 4.000000e+00)
+; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 2.000000e+00)
+; IEEE-GOODFREXP-NEXT: [[TMP24:%.*]] = fneg contract float [[TMP13]]
; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]])
; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP29]], 0
; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = extractvalue { float, i32 } [[TMP29]], 1
@@ -3903,19 +3943,9 @@ define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float
; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SQRT]], i64 1
; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[SQRT]], i64 2
; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[SQRT]], i64 3
-; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP8]])
-; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0
-; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP8]])
-; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]]
-; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]])
-; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]])
-; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = fneg contract float [[TMP9]]
-; IEEE-BADFREXP-NEXT: [[TMP48:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP18]])
-; IEEE-BADFREXP-NEXT: [[TMP49:%.*]] = extractvalue { float, i32 } [[TMP48]], 0
-; IEEE-BADFREXP-NEXT: [[TMP21:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP18]])
-; IEEE-BADFREXP-NEXT: [[TMP22:%.*]] = sub i32 0, [[TMP21]]
-; IEEE-BADFREXP-NEXT: [[TMP50:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP49]])
-; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP50]], i32 [[TMP22]])
+; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 4.000000e+00)
+; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 2.000000e+00)
+; IEEE-BADFREXP-NEXT: [[TMP24:%.*]] = fneg contract float [[TMP13]]
; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]])
; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = extractvalue { float, i32 } [[TMP29]], 0
; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP10]])
@@ -3956,9 +3986,9 @@ define <4 x float> @rsq_f32_vector_const_denom(ptr addrspace(1) %out, <2 x float
; DAZ-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SQRT]], i64 1
; DAZ-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[SQRT]], i64 2
; DAZ-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[SQRT]], i64 3
-; DAZ-NEXT: [[TMP12:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP8]])
-; DAZ-NEXT: [[TMP13:%.*]] = fneg contract float [[TMP9]]
-; DAZ-NEXT: [[TMP14:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]])
+; DAZ-NEXT: [[TMP12:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 4.000000e+00)
+; DAZ-NEXT: [[TMP13:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float 2.000000e+00)
+; DAZ-NEXT: [[TMP14:%.*]] = fneg contract float [[TMP13]]
; DAZ-NEXT: [[TMP15:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP10]])
; DAZ-NEXT: [[TMP16:%.*]] = extractvalue { float, i32 } [[TMP15]], 0
; DAZ-NEXT: [[TMP17:%.*]] = extractvalue { float, i32 } [[TMP15]], 1
diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-avoid-coalesce-class-with-no-registers.ll b/llvm/test/CodeGen/AMDGPU/coalescer-avoid-coalesce-class-with-no-registers.ll
new file mode 100644
index 0000000..f466513
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/coalescer-avoid-coalesce-class-with-no-registers.ll
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+; Make sure the coalescer doesn't introduce any uses of
+; vreg_1024. None are available to allocate with the register budget
+; of this function.
+
+define void @no_introduce_vreg_1024() #0 {
+; CHECK-LABEL: no_introduce_vreg_1024:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def v[0:7]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_mov_b32_e32 v9, v0
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use v[0:15]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %tuple = call <8 x i32> asm sideeffect "; def $0","=v"()
+ %sub0 = extractelement <8 x i32> %tuple, i32 0
+ %insert = insertelement <16 x i32> poison, i32 %sub0, i32 9
+ call void asm sideeffect "; use $0","v"(<16 x i32> %insert)
+ ret void
+}
+
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="10,10" }
diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-avoid-coalesce-class-with-no-registers.mir b/llvm/test/CodeGen/AMDGPU/coalescer-avoid-coalesce-class-with-no-registers.mir
new file mode 100644
index 0000000..1f414eb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/coalescer-avoid-coalesce-class-with-no-registers.mir
@@ -0,0 +1,34 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=register-coalescer -o - %s | FileCheck %s
+
+# The register budget for this function does not permit using 1024-bit
+# registers. The coalescer should not introduce a 1024-bit virtual
+# register which will fail to allocate.
+
+--- |
+ define void @no_introduce_vreg_1024() #0 {
+ ret void
+ }
+
+ attributes #0 = { "amdgpu-waves-per-eu"="10,10" }
+...
+---
+name: no_introduce_vreg_1024
+tracksRegLiveness: true
+machineFunctionInfo:
+ occupancy: 10
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+
+ ; CHECK-LABEL: name: no_introduce_vreg_1024
+ ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub9:vreg_512 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: SI_RETURN implicit [[COPY1]]
+ %0:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ undef %1.sub9:vreg_512 = COPY %0.sub0
+ SI_RETURN implicit %1
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
index 3983655..38239c5 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
@@ -1634,29 +1634,18 @@ define float @v_recip_sqrt_f32_ulp25_contract(float %x) {
; IR-IEEE-SDAG-LABEL: v_recip_sqrt_f32_ulp25_contract:
; IR-IEEE-SDAG: ; %bb.0:
; IR-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; IR-IEEE-SDAG-NEXT: s_mov_b32 s4, 0xf800000
-; IR-IEEE-SDAG-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
+; IR-IEEE-SDAG-NEXT: s_mov_b32 s4, 0x800000
; IR-IEEE-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; IR-IEEE-SDAG-NEXT: v_sqrt_f32_e32 v1, v0
-; IR-IEEE-SDAG-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
-; IR-IEEE-SDAG-NEXT: v_fma_f32 v3, -v2, v1, v0
-; IR-IEEE-SDAG-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
-; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; IR-IEEE-SDAG-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
-; IR-IEEE-SDAG-NEXT: v_fma_f32 v1, -v3, v1, v0
-; IR-IEEE-SDAG-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
-; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; IR-IEEE-SDAG-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; IR-IEEE-SDAG-NEXT: v_mov_b32_e32 v2, 0x260
-; IR-IEEE-SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
+; IR-IEEE-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; IR-IEEE-SDAG-NEXT: v_sqrt_f32_e32 v0, v0
+; IR-IEEE-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc
+; IR-IEEE-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
; IR-IEEE-SDAG-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
; IR-IEEE-SDAG-NEXT: v_rcp_f32_e32 v2, v1
-; IR-IEEE-SDAG-NEXT: v_fma_f32 v3, -v1, v2, 1.0
-; IR-IEEE-SDAG-NEXT: v_fma_f32 v2, v3, v2, v2
; IR-IEEE-SDAG-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; IR-IEEE-SDAG-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; IR-IEEE-SDAG-NEXT: v_fma_f32 v2, v4, v2, v2
; IR-IEEE-SDAG-NEXT: v_mul_f32_e32 v4, v3, v2
; IR-IEEE-SDAG-NEXT: v_fma_f32 v5, -v1, v4, v3
; IR-IEEE-SDAG-NEXT: v_fma_f32 v4, v5, v2, v4
@@ -1668,24 +1657,14 @@ define float @v_recip_sqrt_f32_ulp25_contract(float %x) {
; IR-IEEE-GISEL-LABEL: v_recip_sqrt_f32_ulp25_contract:
; IR-IEEE-GISEL: ; %bb.0:
; IR-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; IR-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0xf800000
-; IR-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
+; IR-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
; IR-IEEE-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; IR-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v1, v0
-; IR-IEEE-GISEL-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
-; IR-IEEE-GISEL-NEXT: v_fma_f32 v3, -v2, v1, v0
-; IR-IEEE-GISEL-NEXT: v_add_i32_e64 v4, s[4:5], 1, v1
-; IR-IEEE-GISEL-NEXT: v_fma_f32 v5, -v4, v1, v0
-; IR-IEEE-GISEL-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
-; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5]
-; IR-IEEE-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v5
-; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5]
-; IR-IEEE-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; IR-IEEE-GISEL-NEXT: v_mov_b32_e32 v2, 0x260
-; IR-IEEE-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; IR-IEEE-GISEL-NEXT: v_lshlrev_b32_e32 v1, 5, v1
+; IR-IEEE-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; IR-IEEE-GISEL-NEXT: v_sqrt_f32_e32 v0, v0
+; IR-IEEE-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -16, vcc
+; IR-IEEE-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
; IR-IEEE-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
; IR-IEEE-GISEL-NEXT: v_rcp_f32_e32 v2, v1
; IR-IEEE-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
@@ -1705,75 +1684,24 @@ define float @v_recip_sqrt_f32_ulp25_contract(float %x) {
; CODEGEN-DAZ-NEXT: v_rsq_f32_e32 v0, v0
; CODEGEN-DAZ-NEXT: s_setpc_b64 s[30:31]
;
-; IR-DAZ-SDAG-LABEL: v_recip_sqrt_f32_ulp25_contract:
-; IR-DAZ-SDAG: ; %bb.0:
-; IR-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; IR-DAZ-SDAG-NEXT: s_mov_b32 s4, 0xf800000
-; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; IR-DAZ-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; IR-DAZ-SDAG-NEXT: v_rsq_f32_e32 v1, v0
-; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, v0, v1
-; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v3, -v1, v2, 0.5
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v2, v2, v3, v2
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v4, -v2, v2, v0
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, v1, v3, v1
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, v4, v1, v2
-; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; IR-DAZ-SDAG-NEXT: v_mov_b32_e32 v2, 0x260
-; IR-DAZ-SDAG-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; IR-DAZ-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; IR-DAZ-SDAG-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
-; IR-DAZ-SDAG-NEXT: v_rcp_f32_e32 v2, v1
-; IR-DAZ-SDAG-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
-; IR-DAZ-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v2, v4, v2, v2
-; IR-DAZ-SDAG-NEXT: v_mul_f32_e32 v4, v3, v2
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v5, -v1, v4, v3
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v4, v5, v2, v4
-; IR-DAZ-SDAG-NEXT: v_fma_f32 v1, -v1, v4, v3
-; IR-DAZ-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; IR-DAZ-SDAG-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; IR-DAZ-SDAG-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
-; IR-DAZ-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; IR-DAZ-GISEL-LABEL: v_recip_sqrt_f32_ulp25_contract:
-; IR-DAZ-GISEL: ; %bb.0:
-; IR-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; IR-DAZ-GISEL-NEXT: v_mov_b32_e32 v1, 0xf800000
-; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
-; IR-DAZ-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
-; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; IR-DAZ-GISEL-NEXT: v_rsq_f32_e32 v1, v0
-; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, v0, v1
-; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; IR-DAZ-GISEL-NEXT: v_fma_f32 v3, -v1, v2, 0.5
-; IR-DAZ-GISEL-NEXT: v_fma_f32 v2, v2, v3, v2
-; IR-DAZ-GISEL-NEXT: v_fma_f32 v1, v1, v3, v1
-; IR-DAZ-GISEL-NEXT: v_fma_f32 v3, -v2, v2, v0
-; IR-DAZ-GISEL-NEXT: v_fma_f32 v1, v3, v1, v2
-; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; IR-DAZ-GISEL-NEXT: v_mov_b32_e32 v2, 0x260
-; IR-DAZ-GISEL-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; IR-DAZ-GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; IR-DAZ-GISEL-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
-; IR-DAZ-GISEL-NEXT: v_rcp_f32_e32 v2, v1
-; IR-DAZ-GISEL-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
-; IR-DAZ-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; IR-DAZ-GISEL-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; IR-DAZ-GISEL-NEXT: v_fma_f32 v2, v4, v2, v2
-; IR-DAZ-GISEL-NEXT: v_mul_f32_e32 v4, v3, v2
-; IR-DAZ-GISEL-NEXT: v_fma_f32 v5, -v1, v4, v3
-; IR-DAZ-GISEL-NEXT: v_fma_f32 v4, v5, v2, v4
-; IR-DAZ-GISEL-NEXT: v_fma_f32 v1, -v1, v4, v3
-; IR-DAZ-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; IR-DAZ-GISEL-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; IR-DAZ-GISEL-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
-; IR-DAZ-GISEL-NEXT: s_setpc_b64 s[30:31]
+; IR-DAZ-LABEL: v_recip_sqrt_f32_ulp25_contract:
+; IR-DAZ: ; %bb.0:
+; IR-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; IR-DAZ-NEXT: v_sqrt_f32_e32 v0, v0
+; IR-DAZ-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; IR-DAZ-NEXT: v_rcp_f32_e32 v2, v1
+; IR-DAZ-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; IR-DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; IR-DAZ-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; IR-DAZ-NEXT: v_fma_f32 v2, v4, v2, v2
+; IR-DAZ-NEXT: v_mul_f32_e32 v4, v3, v2
+; IR-DAZ-NEXT: v_fma_f32 v5, -v1, v4, v3
+; IR-DAZ-NEXT: v_fma_f32 v4, v5, v2, v4
+; IR-DAZ-NEXT: v_fma_f32 v1, -v1, v4, v3
+; IR-DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; IR-DAZ-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; IR-DAZ-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-DAZ-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract float @llvm.sqrt.f32(float %x), !fpmath !0
%fdiv = fdiv contract float 1.0, %sqrt, !fpmath !0
ret float %fdiv
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 9233f80..9e15225 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -7464,18 +7464,15 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-GISEL-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 1.0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, 2.0
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v4
+; SI-GISEL-NEXT: v_max_f32_e32 v2, 2.0, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 4.0
-; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v2, 4.0, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-GISEL-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
@@ -7639,27 +7636,24 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0
; SI-GISEL-NEXT: s_mov_b32 s10, 0
; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 1.0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 2.0
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3]
-; SI-GISEL-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v5, 4.0
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5]
-; SI-GISEL-NEXT: buffer_load_ushort v6, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
-; SI-GISEL-NEXT: buffer_load_ushort v7, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4
-; SI-GISEL-NEXT: v_add_f32_e32 v2, v4, v2
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v6
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v7
+; SI-GISEL-NEXT: v_add_f32_e32 v3, 2.0, v3
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5
+; SI-GISEL-NEXT: v_add_f32_e32 v4, 4.0, v4
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3
@@ -8712,12 +8706,10 @@ define half @v_test_fmed3_r_i_i_f16_minimumnum_maximumnum(half %a) #1 {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 2.0
-; SI-GISEL-NEXT: v_max_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_max_f32_e32 v0, 2.0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 4.0
-; SI-GISEL-NEXT: v_min_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_min_f32_e32 v0, 4.0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -8796,17 +8788,15 @@ define <2 x half> @v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum(<2 x half> %a)
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 2.0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 4.0
-; SI-GISEL-NEXT: v_max_f32_e32 v0, v0, v2
-; SI-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT: v_max_f32_e32 v0, 2.0, v0
+; SI-GISEL-NEXT: v_max_f32_e32 v1, 2.0, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_min_f32_e32 v0, v0, v3
-; SI-GISEL-NEXT: v_min_f32_e32 v1, v1, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v0, 4.0, v0
+; SI-GISEL-NEXT: v_min_f32_e32 v1, 4.0, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index ee11b92..0c1448a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -44,23 +44,23 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_mov_b32_e32 v12, s16
+; GISEL-NEXT: v_mov_b32_e32 v16, s16
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[6:7]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -834,24 +834,24 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v12, s2
+; GISEL-NEXT: v_mov_b32_e32 v16, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1349,24 +1349,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v12, s2
+; GISEL-NEXT: v_mov_b32_e32 v16, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1513,24 +1513,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v12, s2
+; GISEL-NEXT: v_mov_b32_e32 v16, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1677,24 +1677,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v12, s2
+; GISEL-NEXT: v_mov_b32_e32 v16, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1841,24 +1841,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
+; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT: v_mov_b32_e32 v12, s2
+; GISEL-NEXT: v_mov_b32_e32 v16, s2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
+; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
; GISEL-NEXT: v_mov_b32_e32 v0, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
; GISEL-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index af79c91..ac356fa 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -6011,8 +6011,7 @@ define half @v_exp_f16_fast(half %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 0x3dc5
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
@@ -6512,10 +6511,9 @@ define <2 x half> @v_exp_v2f16_fast(<2 x half> %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 0x3dc5
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -6709,12 +6707,11 @@ define <3 x half> @v_exp_v3f16_afn(<3 x half> %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 0x3dc5
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v3
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v3
-; SI-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index a99c199..d12ebe4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -6092,8 +6092,7 @@ define half @v_exp10_f16_fast(half %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 0x3dc5
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
@@ -6594,10 +6593,9 @@ define <2 x half> @v_exp10_v2f16_fast(<2 x half> %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 0x3dc5
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -6791,12 +6789,11 @@ define <3 x half> @v_exp10_v3f16_afn(<3 x half> %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 0x3dc5
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v3
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v3
-; SI-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
index 3f66c23..259ee0b 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
@@ -488,13 +488,11 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v1
+; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v1
+; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v0
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext half %src0 to float
@@ -582,15 +580,13 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi
; GISEL-CI-NEXT: s_mov_b32 s7, 0xf000
; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v0
; GISEL-CI-NEXT: buffer_store_short v0, off, s[4:7], 0
; GISEL-CI-NEXT: s_waitcnt vmcnt(0)
-; GISEL-CI-NEXT: v_max_f32_e32 v1, v2, v1
+; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext half %src0 to float
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index 21e6faf4..ba77552 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -313,13 +313,11 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v1
+; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v1
+; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext half %src0 to float
@@ -1009,28 +1007,26 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5
; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v4
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v5
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v2
+; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext <2 x half> %src0 to <2 x float>
@@ -1225,25 +1221,23 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
; GISEL-CI-NEXT: v_mac_f32_e32 v7, v1, v4
; GISEL-CI-NEXT: v_mac_f32_e32 v8, v2, v5
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v7
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v8
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v8
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2
-; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
-; GISEL-CI-NEXT: v_max_f32_e32 v2, v3, v2
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
+; GISEL-CI-NEXT: v_max_f32_e32 v2, 0, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v3
-; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v3
-; GISEL-CI-NEXT: v_min_f32_e32 v2, v2, v3
+; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
+; GISEL-CI-NEXT: v_min_f32_e32 v2, 1.0, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
@@ -1441,30 +1435,28 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s
; GISEL-CI-NEXT: v_mac_f32_e32 v11, v3, v7
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v8
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v9
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v10
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v4, v11
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v10
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v11
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2
-; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
-; GISEL-CI-NEXT: v_max_f32_e32 v3, v3, v2
-; GISEL-CI-NEXT: v_max_f32_e32 v2, v4, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0
+; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
+; GISEL-CI-NEXT: v_max_f32_e32 v2, 0, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v3, 0, v3
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v2
-; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v5
-; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v5
-; GISEL-CI-NEXT: v_min_f32_e32 v2, v3, v5
-; GISEL-CI-NEXT: v_min_f32_e32 v3, v4, v5
+; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0
+; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
+; GISEL-CI-NEXT: v_min_f32_e32 v2, 1.0, v2
+; GISEL-CI-NEXT: v_min_f32_e32 v3, 1.0, v3
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -1622,16 +1614,14 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v5
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v4
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v0
; GISEL-CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1
; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
@@ -1790,17 +1780,15 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v5
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v4
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0
; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
index 4f73e8e..c90b2c9 100644
--- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
@@ -271,8 +271,7 @@ define half @v_maximumnum_f16_1.0(half %x) {
; GFX7-GISEL: ; %bb.0:
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 1.0
-; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: v_max_f32_e32 v0, 1.0, v0
; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index 0af655df..4bb6538 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -2399,8 +2399,9 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX90A-NEXT: ; kill: def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $exec
; GFX90A-NEXT: .LBB9_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB9_2 Depth 2
@@ -2409,7 +2410,7 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
; GFX90A-NEXT: ; Parent Loop BB9_1 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX90A-NEXT: s_add_i32 s1, s1, -1
; GFX90A-NEXT: s_cmp_lg_u32 s1, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB9_2
@@ -2468,8 +2469,9 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
; GFX942-NEXT: v_accvgpr_mov_b32 a29, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a30, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0
-; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-NEXT: ; kill: def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $exec
; GFX942-NEXT: .LBB9_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Loop Header: Depth=1
; GFX942-NEXT: ; Child Loop BB9_2 Depth 2
@@ -2478,7 +2480,7 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
; GFX942-NEXT: ; Parent Loop BB9_1 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
; GFX942-NEXT: s_add_i32 s1, s1, -1
; GFX942-NEXT: s_cmp_lg_u32 s1, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB9_2
diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
index 558006d..64e8b7b 100644
--- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
@@ -271,8 +271,7 @@ define half @v_minimumnum_f16_1.0(half %x) {
; GFX7-GISEL: ; %bb.0:
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 1.0
-; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: v_min_f32_e32 v0, 1.0, v0
; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/regpressure_printer.mir b/llvm/test/CodeGen/AMDGPU/regpressure_printer.mir
index 8d5b5e4..b41aa08 100644
--- a/llvm/test/CodeGen/AMDGPU/regpressure_printer.mir
+++ b/llvm/test/CodeGen/AMDGPU/regpressure_printer.mir
@@ -510,14 +510,14 @@ body: |
; RPU-NEXT: 0 0 $sgpr0 = S_BUFFER_LOAD_DWORD_IMM $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0
; RPU-NEXT: 0 0
; RPU-NEXT: 0 1 undef %0.sub5:vreg_512 = V_MOV_B32_e32 5, implicit $exec
- ; RPU-NEXT: 0 0
- ; RPU-NEXT: 0 0 S_CMP_GT_U32 $sgpr0, 15, implicit-def $scc
- ; RPU-NEXT: 0 0
- ; RPU-NEXT: 0 0 S_CBRANCH_SCC1 %bb.2, implicit $scc
- ; RPU-NEXT: 0 0
- ; RPU-NEXT: 0 0 S_BRANCH %bb.1
- ; RPU-NEXT: 0 0
- ; RPU-NEXT: Live-out:
+ ; RPU-NEXT: 0 1
+ ; RPU-NEXT: 0 1 S_CMP_GT_U32 $sgpr0, 15, implicit-def $scc
+ ; RPU-NEXT: 0 1
+ ; RPU-NEXT: 0 1 S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; RPU-NEXT: 0 1
+ ; RPU-NEXT: 0 1 S_BRANCH %bb.1
+ ; RPU-NEXT: 0 1
+ ; RPU-NEXT: Live-out: %0:0000000000000C00
; RPU-NEXT: Live-thr:
; RPU-NEXT: 0 0
; RPU-NEXT: bb.1:
@@ -571,8 +571,6 @@ body: |
; RPD-NEXT: 0 1 S_BRANCH %bb.1
; RPD-NEXT: 0 1
; RPD-NEXT: Live-out: %0:0000000000000C00
- ; RPD-NEXT: mis LIS:
- ; RPD-NEXT: %0:L0000000000000C00 isn't found in LIS reported set
; RPD-NEXT: Live-thr:
; RPD-NEXT: 0 0
; RPD-NEXT: bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-select.ll b/llvm/test/CodeGen/AMDGPU/uniform-select.ll
index f001bf0..b52913f 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-select.ll
@@ -20,34 +20,34 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) {
; GFX90A-NEXT: s_cmp_eq_u32 s1, 1
; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec
-; GFX90A-NEXT: s_cselect_b32 s7, s4, s3
+; GFX90A-NEXT: s_cselect_b32 s7, s3, s2
; GFX90A-NEXT: s_cmp_eq_u32 s1, 2
; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec
-; GFX90A-NEXT: s_cselect_b32 s7, s5, s7
+; GFX90A-NEXT: s_cselect_b32 s7, s4, s7
; GFX90A-NEXT: s_cmp_eq_u32 s1, 3
; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec
-; GFX90A-NEXT: s_cselect_b32 s7, s6, s7
+; GFX90A-NEXT: s_cselect_b32 s7, s5, s7
; GFX90A-NEXT: s_or_b32 s7, s7, s0
; GFX90A-NEXT: s_cmp_eq_u32 s1, 1
; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX90A-NEXT: s_and_b64 s[10:11], s[8:9], exec
-; GFX90A-NEXT: s_cselect_b32 s4, s7, s4
+; GFX90A-NEXT: s_cselect_b32 s3, s7, s3
; GFX90A-NEXT: s_cmp_eq_u32 s1, 3
; GFX90A-NEXT: s_cselect_b64 s[10:11], -1, 0
; GFX90A-NEXT: s_and_b64 s[12:13], s[10:11], exec
-; GFX90A-NEXT: s_cselect_b32 s6, s7, s6
+; GFX90A-NEXT: s_cselect_b32 s5, s7, s5
; GFX90A-NEXT: s_cmp_eq_u32 s1, 2
; GFX90A-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX90A-NEXT: s_and_b64 s[14:15], s[12:13], exec
-; GFX90A-NEXT: s_cselect_b32 s5, s7, s5
+; GFX90A-NEXT: s_cselect_b32 s4, s7, s4
; GFX90A-NEXT: s_cmp_eq_u32 s1, 0
-; GFX90A-NEXT: s_cselect_b32 s3, s7, s3
+; GFX90A-NEXT: s_cselect_b32 s2, s7, s2
; GFX90A-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9]
; GFX90A-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec
-; GFX90A-NEXT: s_cselect_b32 s2, 0, s2
+; GFX90A-NEXT: s_cselect_b32 s6, 0, s6
; GFX90A-NEXT: s_mov_b64 vcc, vcc
; GFX90A-NEXT: s_cbranch_vccnz .LBB0_1
; GFX90A-NEXT: ; %bb.2: ; %DummyReturnBlock
@@ -68,34 +68,34 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) {
; GFX942-NEXT: s_cmp_eq_u32 s1, 1
; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX942-NEXT: s_and_b64 s[8:9], s[8:9], exec
-; GFX942-NEXT: s_cselect_b32 s7, s4, s3
+; GFX942-NEXT: s_cselect_b32 s7, s3, s2
; GFX942-NEXT: s_cmp_eq_u32 s1, 2
; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX942-NEXT: s_and_b64 s[8:9], s[8:9], exec
-; GFX942-NEXT: s_cselect_b32 s7, s5, s7
+; GFX942-NEXT: s_cselect_b32 s7, s4, s7
; GFX942-NEXT: s_cmp_eq_u32 s1, 3
; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX942-NEXT: s_and_b64 s[8:9], s[8:9], exec
-; GFX942-NEXT: s_cselect_b32 s7, s6, s7
+; GFX942-NEXT: s_cselect_b32 s7, s5, s7
; GFX942-NEXT: s_or_b32 s7, s7, s0
; GFX942-NEXT: s_cmp_eq_u32 s1, 1
; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0
; GFX942-NEXT: s_and_b64 s[10:11], s[8:9], exec
-; GFX942-NEXT: s_cselect_b32 s4, s7, s4
+; GFX942-NEXT: s_cselect_b32 s3, s7, s3
; GFX942-NEXT: s_cmp_eq_u32 s1, 3
; GFX942-NEXT: s_cselect_b64 s[10:11], -1, 0
; GFX942-NEXT: s_and_b64 s[12:13], s[10:11], exec
-; GFX942-NEXT: s_cselect_b32 s6, s7, s6
+; GFX942-NEXT: s_cselect_b32 s5, s7, s5
; GFX942-NEXT: s_cmp_eq_u32 s1, 2
; GFX942-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX942-NEXT: s_and_b64 s[14:15], s[12:13], exec
-; GFX942-NEXT: s_cselect_b32 s5, s7, s5
+; GFX942-NEXT: s_cselect_b32 s4, s7, s4
; GFX942-NEXT: s_cmp_eq_u32 s1, 0
-; GFX942-NEXT: s_cselect_b32 s3, s7, s3
+; GFX942-NEXT: s_cselect_b32 s2, s7, s2
; GFX942-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9]
; GFX942-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
; GFX942-NEXT: s_and_b64 s[8:9], s[8:9], exec
-; GFX942-NEXT: s_cselect_b32 s2, 0, s2
+; GFX942-NEXT: s_cselect_b32 s6, 0, s6
; GFX942-NEXT: s_mov_b64 vcc, vcc
; GFX942-NEXT: s_cbranch_vccnz .LBB0_1
; GFX942-NEXT: ; %bb.2: ; %DummyReturnBlock
@@ -117,34 +117,34 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) {
; GFX1030-NEXT: s_cmp_eq_u32 s1, 1
; GFX1030-NEXT: s_cselect_b32 s7, -1, 0
; GFX1030-NEXT: s_and_b32 s7, s7, exec_lo
-; GFX1030-NEXT: s_cselect_b32 s7, s4, s3
+; GFX1030-NEXT: s_cselect_b32 s7, s3, s2
; GFX1030-NEXT: s_cmp_eq_u32 s1, 2
; GFX1030-NEXT: s_cselect_b32 s8, -1, 0
; GFX1030-NEXT: s_and_b32 s8, s8, exec_lo
-; GFX1030-NEXT: s_cselect_b32 s7, s5, s7
+; GFX1030-NEXT: s_cselect_b32 s7, s4, s7
; GFX1030-NEXT: s_cmp_eq_u32 s1, 3
; GFX1030-NEXT: s_cselect_b32 s8, -1, 0
; GFX1030-NEXT: s_and_b32 s8, s8, exec_lo
-; GFX1030-NEXT: s_cselect_b32 s7, s6, s7
+; GFX1030-NEXT: s_cselect_b32 s7, s5, s7
; GFX1030-NEXT: s_or_b32 s7, s7, s0
; GFX1030-NEXT: s_cmp_eq_u32 s1, 1
; GFX1030-NEXT: s_cselect_b32 s8, -1, 0
; GFX1030-NEXT: s_and_b32 s9, s8, exec_lo
-; GFX1030-NEXT: s_cselect_b32 s4, s7, s4
+; GFX1030-NEXT: s_cselect_b32 s3, s7, s3
; GFX1030-NEXT: s_cmp_eq_u32 s1, 3
; GFX1030-NEXT: s_cselect_b32 s9, -1, 0
; GFX1030-NEXT: s_and_b32 s10, s9, exec_lo
-; GFX1030-NEXT: s_cselect_b32 s6, s7, s6
+; GFX1030-NEXT: s_cselect_b32 s5, s7, s5
; GFX1030-NEXT: s_cmp_eq_u32 s1, 2
; GFX1030-NEXT: s_cselect_b32 s10, -1, 0
; GFX1030-NEXT: s_and_b32 s11, s10, exec_lo
-; GFX1030-NEXT: s_cselect_b32 s5, s7, s5
+; GFX1030-NEXT: s_cselect_b32 s4, s7, s4
; GFX1030-NEXT: s_cmp_eq_u32 s1, 0
-; GFX1030-NEXT: s_cselect_b32 s3, s7, s3
+; GFX1030-NEXT: s_cselect_b32 s2, s7, s2
; GFX1030-NEXT: s_or_b32 s7, s10, s8
; GFX1030-NEXT: s_or_b32 s7, s9, s7
; GFX1030-NEXT: s_and_b32 s7, s7, exec_lo
-; GFX1030-NEXT: s_cselect_b32 s2, 0, s2
+; GFX1030-NEXT: s_cselect_b32 s6, 0, s6
; GFX1030-NEXT: s_cbranch_vccnz .LBB0_1
; GFX1030-NEXT: ; %bb.2: ; %DummyReturnBlock
; GFX1030-NEXT: s_endpgm
@@ -166,38 +166,38 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) {
; GFX1100-NEXT: s_cselect_b32 s7, -1, 0
; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1100-NEXT: s_and_b32 s7, s7, exec_lo
-; GFX1100-NEXT: s_cselect_b32 s7, s4, s3
+; GFX1100-NEXT: s_cselect_b32 s7, s3, s2
; GFX1100-NEXT: s_cmp_eq_u32 s1, 2
; GFX1100-NEXT: s_cselect_b32 s8, -1, 0
; GFX1100-NEXT: s_and_b32 s8, s8, exec_lo
-; GFX1100-NEXT: s_cselect_b32 s7, s5, s7
+; GFX1100-NEXT: s_cselect_b32 s7, s4, s7
; GFX1100-NEXT: s_cmp_eq_u32 s1, 3
; GFX1100-NEXT: s_cselect_b32 s8, -1, 0
; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1100-NEXT: s_and_b32 s8, s8, exec_lo
-; GFX1100-NEXT: s_cselect_b32 s7, s6, s7
+; GFX1100-NEXT: s_cselect_b32 s7, s5, s7
; GFX1100-NEXT: s_or_b32 s7, s7, s0
; GFX1100-NEXT: s_cmp_eq_u32 s1, 1
; GFX1100-NEXT: s_cselect_b32 s8, -1, 0
; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1100-NEXT: s_and_b32 s9, s8, exec_lo
-; GFX1100-NEXT: s_cselect_b32 s4, s7, s4
+; GFX1100-NEXT: s_cselect_b32 s3, s7, s3
; GFX1100-NEXT: s_cmp_eq_u32 s1, 3
; GFX1100-NEXT: s_cselect_b32 s9, -1, 0
; GFX1100-NEXT: s_and_b32 s10, s9, exec_lo
-; GFX1100-NEXT: s_cselect_b32 s6, s7, s6
+; GFX1100-NEXT: s_cselect_b32 s5, s7, s5
; GFX1100-NEXT: s_cmp_eq_u32 s1, 2
; GFX1100-NEXT: s_cselect_b32 s10, -1, 0
; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX1100-NEXT: s_and_b32 s11, s10, exec_lo
-; GFX1100-NEXT: s_cselect_b32 s5, s7, s5
+; GFX1100-NEXT: s_cselect_b32 s4, s7, s4
; GFX1100-NEXT: s_cmp_eq_u32 s1, 0
-; GFX1100-NEXT: s_cselect_b32 s3, s7, s3
+; GFX1100-NEXT: s_cselect_b32 s2, s7, s2
; GFX1100-NEXT: s_or_b32 s7, s10, s8
; GFX1100-NEXT: s_or_b32 s7, s9, s7
; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1100-NEXT: s_and_b32 s7, s7, exec_lo
-; GFX1100-NEXT: s_cselect_b32 s2, 0, s2
+; GFX1100-NEXT: s_cselect_b32 s6, 0, s6
; GFX1100-NEXT: s_cbranch_vccnz .LBB0_1
; GFX1100-NEXT: ; %bb.2: ; %DummyReturnBlock
; GFX1100-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index 6b5bae0..c9b94e0 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -6,12 +6,12 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=SDAG-GFX12,SDAG-GFX12-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=SDAG-GFX12,SDAG-GFX12-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -global-isel < %s | FileCheck -check-prefixes=GISEL-VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -mattr=+real-true16 -global-isel < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -mattr=-real-true16 -global-isel < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX12,GISEL-GFX12-TRUE16 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX12,GISEL-GFX12-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji -global-isel -new-reg-bank-select < %s | FileCheck -check-prefixes=GISEL-VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel -new-reg-bank-select < %s | FileCheck -check-prefixes=GISEL-GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -mattr=+real-true16 -global-isel -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -mattr=-real-true16 -global-isel -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11,GISEL-GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -global-isel -new-reg-bank-select < %s | FileCheck -check-prefixes=GISEL-GFX12,GISEL-GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -global-isel -new-reg-bank-select < %s | FileCheck -check-prefixes=GISEL-GFX12,GISEL-GFX12-FAKE16 %s
; <GFX9 has no V_SAT_PK, GFX9+ has V_SAT_PK, GFX11 has V_SAT_PK with t16
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
index b5d9d00..8d0e003 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll
@@ -1,21 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
; FIXME-TRUE16. enable gisel
-; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
-; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
+; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
define i8 @test_vector_reduce_smax_v2i8(<2 x i8> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_smax_v2i8:
@@ -1632,6 +1632,7 @@ entry:
ret i8 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is regression. Need pattern to look through COPY.
define i16 @test_vector_reduce_smax_v2i16(<2 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_smax_v2i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -1678,7 +1679,7 @@ define i16 @test_vector_reduce_smax_v2i16(<2 x i16> %v) {
; GFX9-GISEL-LABEL: test_vector_reduce_smax_v2i16:
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1692,7 +1693,7 @@ define i16 @test_vector_reduce_smax_v2i16(<2 x i16> %v) {
; GFX10-GISEL-LABEL: test_vector_reduce_smax_v2i16:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1713,7 +1714,7 @@ define i16 @test_vector_reduce_smax_v2i16(<2 x i16> %v) {
; GFX11-GISEL-LABEL: test_vector_reduce_smax_v2i16:
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1747,7 +1748,7 @@ define i16 @test_vector_reduce_smax_v2i16(<2 x i16> %v) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1900,6 +1901,7 @@ entry:
ret i16 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is regression. Need pattern to look through COPY.
define i16 @test_vector_reduce_smax_v4i16(<4 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_smax_v4i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -1961,7 +1963,7 @@ define i16 @test_vector_reduce_smax_v4i16(<4 x i16> %v) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX9-GISEL-NEXT: s_nop 0
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1977,7 +1979,7 @@ define i16 @test_vector_reduce_smax_v4i16(<4 x i16> %v) {
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2003,7 +2005,7 @@ define i16 @test_vector_reduce_smax_v4i16(<4 x i16> %v) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2041,7 +2043,7 @@ define i16 @test_vector_reduce_smax_v4i16(<4 x i16> %v) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -2049,6 +2051,7 @@ entry:
ret i16 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is regression. Need pattern to look through COPY.
define i16 @test_vector_reduce_smax_v8i16(<8 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_smax_v8i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -2139,7 +2142,7 @@ define i16 @test_vector_reduce_smax_v8i16(<8 x i16> %v) {
; GFX9-GISEL-NEXT: s_nop 0
; GFX9-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX9-GISEL-NEXT: s_nop 0
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2159,7 +2162,7 @@ define i16 @test_vector_reduce_smax_v8i16(<8 x i16> %v) {
; GFX10-GISEL-NEXT: v_pk_max_i16 v0, v0, v2
; GFX10-GISEL-NEXT: v_pk_max_i16 v1, v1, v3
; GFX10-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2192,7 +2195,7 @@ define i16 @test_vector_reduce_smax_v8i16(<8 x i16> %v) {
; GFX11-GISEL-NEXT: v_pk_max_i16 v1, v1, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2238,7 +2241,7 @@ define i16 @test_vector_reduce_smax_v8i16(<8 x i16> %v) {
; GFX12-GISEL-NEXT: v_pk_max_i16 v1, v1, v3
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2247,6 +2250,7 @@ entry:
ret i16 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is regression. Need pattern to look through COPY.
define i16 @test_vector_reduce_smax_v16i16(<16 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_smax_v16i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -2391,7 +2395,7 @@ define i16 @test_vector_reduce_smax_v16i16(<16 x i16> %v) {
; GFX9-GISEL-NEXT: s_nop 0
; GFX9-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX9-GISEL-NEXT: s_nop 0
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2419,7 +2423,7 @@ define i16 @test_vector_reduce_smax_v16i16(<16 x i16> %v) {
; GFX10-GISEL-NEXT: v_pk_max_i16 v0, v0, v2
; GFX10-GISEL-NEXT: v_pk_max_i16 v1, v1, v3
; GFX10-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2467,7 +2471,7 @@ define i16 @test_vector_reduce_smax_v16i16(<16 x i16> %v) {
; GFX11-GISEL-NEXT: v_pk_max_i16 v1, v1, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2528,7 +2532,7 @@ define i16 @test_vector_reduce_smax_v16i16(<16 x i16> %v) {
; GFX12-GISEL-NEXT: v_pk_max_i16 v1, v1, v3
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_max_i16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
index 2a989ec..f15ecf0 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll
@@ -1,21 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
; FIXME-TRUE16. enable gisel
-; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
-; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
+; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
define i8 @test_vector_reduce_smin_v2i8(<2 x i8> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_smin_v2i8:
@@ -1632,6 +1632,7 @@ entry:
ret i8 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is regression. Need pattern to look through COPY.
define i16 @test_vector_reduce_smin_v2i16(<2 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_smin_v2i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -1678,7 +1679,7 @@ define i16 @test_vector_reduce_smin_v2i16(<2 x i16> %v) {
; GFX9-GISEL-LABEL: test_vector_reduce_smin_v2i16:
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1692,7 +1693,7 @@ define i16 @test_vector_reduce_smin_v2i16(<2 x i16> %v) {
; GFX10-GISEL-LABEL: test_vector_reduce_smin_v2i16:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1713,7 +1714,7 @@ define i16 @test_vector_reduce_smin_v2i16(<2 x i16> %v) {
; GFX11-GISEL-LABEL: test_vector_reduce_smin_v2i16:
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1747,7 +1748,7 @@ define i16 @test_vector_reduce_smin_v2i16(<2 x i16> %v) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1900,6 +1901,7 @@ entry:
ret i16 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is regression. Need pattern to look through COPY.
define i16 @test_vector_reduce_smin_v4i16(<4 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_smin_v4i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -1961,7 +1963,7 @@ define i16 @test_vector_reduce_smin_v4i16(<4 x i16> %v) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX9-GISEL-NEXT: s_nop 0
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1977,7 +1979,7 @@ define i16 @test_vector_reduce_smin_v4i16(<4 x i16> %v) {
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2003,7 +2005,7 @@ define i16 @test_vector_reduce_smin_v4i16(<4 x i16> %v) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2041,7 +2043,7 @@ define i16 @test_vector_reduce_smin_v4i16(<4 x i16> %v) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -2049,6 +2051,7 @@ entry:
ret i16 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is regression. Need pattern to look through COPY.
define i16 @test_vector_reduce_smin_v8i16(<8 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_smin_v8i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -2139,7 +2142,7 @@ define i16 @test_vector_reduce_smin_v8i16(<8 x i16> %v) {
; GFX9-GISEL-NEXT: s_nop 0
; GFX9-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX9-GISEL-NEXT: s_nop 0
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2159,7 +2162,7 @@ define i16 @test_vector_reduce_smin_v8i16(<8 x i16> %v) {
; GFX10-GISEL-NEXT: v_pk_min_i16 v0, v0, v2
; GFX10-GISEL-NEXT: v_pk_min_i16 v1, v1, v3
; GFX10-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2192,7 +2195,7 @@ define i16 @test_vector_reduce_smin_v8i16(<8 x i16> %v) {
; GFX11-GISEL-NEXT: v_pk_min_i16 v1, v1, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2238,7 +2241,7 @@ define i16 @test_vector_reduce_smin_v8i16(<8 x i16> %v) {
; GFX12-GISEL-NEXT: v_pk_min_i16 v1, v1, v3
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2391,7 +2394,7 @@ define i16 @test_vector_reduce_smin_v16i16(<16 x i16> %v) {
; GFX9-GISEL-NEXT: s_nop 0
; GFX9-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX9-GISEL-NEXT: s_nop 0
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2419,7 +2422,7 @@ define i16 @test_vector_reduce_smin_v16i16(<16 x i16> %v) {
; GFX10-GISEL-NEXT: v_pk_min_i16 v0, v0, v2
; GFX10-GISEL-NEXT: v_pk_min_i16 v1, v1, v3
; GFX10-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2467,7 +2470,7 @@ define i16 @test_vector_reduce_smin_v16i16(<16 x i16> %v) {
; GFX11-GISEL-NEXT: v_pk_min_i16 v1, v1, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2528,7 +2531,7 @@ define i16 @test_vector_reduce_smin_v16i16(<16 x i16> %v) {
; GFX12-GISEL-NEXT: v_pk_min_i16 v1, v1, v3
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_min_i16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
index 69fd58a..e62165c 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll
@@ -1,21 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
; FIXME-TRUE16. enable gisel
-; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
-; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
+; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
define i8 @test_vector_reduce_umax_v2i8(<2 x i8> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_umax_v2i8:
@@ -1525,6 +1525,7 @@ entry:
ret i8 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is regression. Need pattern to look through COPY.
define i16 @test_vector_reduce_umax_v2i16(<2 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_umax_v2i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -1569,7 +1570,7 @@ define i16 @test_vector_reduce_umax_v2i16(<2 x i16> %v) {
; GFX9-GISEL-LABEL: test_vector_reduce_umax_v2i16:
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1583,7 +1584,7 @@ define i16 @test_vector_reduce_umax_v2i16(<2 x i16> %v) {
; GFX10-GISEL-LABEL: test_vector_reduce_umax_v2i16:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1604,7 +1605,7 @@ define i16 @test_vector_reduce_umax_v2i16(<2 x i16> %v) {
; GFX11-GISEL-LABEL: test_vector_reduce_umax_v2i16:
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1638,7 +1639,7 @@ define i16 @test_vector_reduce_umax_v2i16(<2 x i16> %v) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1782,6 +1783,7 @@ entry:
ret i16 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is regression. Need pattern to look through COPY.
define i16 @test_vector_reduce_umax_v4i16(<4 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_umax_v4i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -1841,7 +1843,7 @@ define i16 @test_vector_reduce_umax_v4i16(<4 x i16> %v) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX9-GISEL-NEXT: s_nop 0
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1857,7 +1859,7 @@ define i16 @test_vector_reduce_umax_v4i16(<4 x i16> %v) {
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1883,7 +1885,7 @@ define i16 @test_vector_reduce_umax_v4i16(<4 x i16> %v) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1921,7 +1923,7 @@ define i16 @test_vector_reduce_umax_v4i16(<4 x i16> %v) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -1929,6 +1931,7 @@ entry:
ret i16 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is regression. Need pattern to look through COPY.
define i16 @test_vector_reduce_umax_v8i16(<8 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_umax_v8i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -2017,7 +2020,7 @@ define i16 @test_vector_reduce_umax_v8i16(<8 x i16> %v) {
; GFX9-GISEL-NEXT: s_nop 0
; GFX9-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX9-GISEL-NEXT: s_nop 0
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2037,7 +2040,7 @@ define i16 @test_vector_reduce_umax_v8i16(<8 x i16> %v) {
; GFX10-GISEL-NEXT: v_pk_max_u16 v0, v0, v2
; GFX10-GISEL-NEXT: v_pk_max_u16 v1, v1, v3
; GFX10-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2070,7 +2073,7 @@ define i16 @test_vector_reduce_umax_v8i16(<8 x i16> %v) {
; GFX11-GISEL-NEXT: v_pk_max_u16 v1, v1, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2116,7 +2119,7 @@ define i16 @test_vector_reduce_umax_v8i16(<8 x i16> %v) {
; GFX12-GISEL-NEXT: v_pk_max_u16 v1, v1, v3
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2125,6 +2128,7 @@ entry:
ret i16 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is regression. Need pattern to look through COPY.
define i16 @test_vector_reduce_umax_v16i16(<16 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_umax_v16i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -2267,7 +2271,7 @@ define i16 @test_vector_reduce_umax_v16i16(<16 x i16> %v) {
; GFX9-GISEL-NEXT: s_nop 0
; GFX9-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX9-GISEL-NEXT: s_nop 0
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2295,7 +2299,7 @@ define i16 @test_vector_reduce_umax_v16i16(<16 x i16> %v) {
; GFX10-GISEL-NEXT: v_pk_max_u16 v0, v0, v2
; GFX10-GISEL-NEXT: v_pk_max_u16 v1, v1, v3
; GFX10-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2343,7 +2347,7 @@ define i16 @test_vector_reduce_umax_v16i16(<16 x i16> %v) {
; GFX11-GISEL-NEXT: v_pk_max_u16 v1, v1, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2404,7 +2408,7 @@ define i16 @test_vector_reduce_umax_v16i16(<16 x i16> %v) {
; GFX12-GISEL-NEXT: v_pk_max_u16 v1, v1, v3
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_max_u16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll
index 1d3b42e..83ecaaa 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll
@@ -1,21 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
; FIXME-TRUE16. enable gisel
-; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
+; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
-; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
+; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
define i8 @test_vector_reduce_umin_v2i8(<2 x i8> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_umin_v2i8:
@@ -1271,6 +1271,7 @@ entry:
ret i8 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is regression. Need pattern to look through COPY.
define i16 @test_vector_reduce_umin_v2i16(<2 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_umin_v2i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -1312,7 +1313,7 @@ define i16 @test_vector_reduce_umin_v2i16(<2 x i16> %v) {
; GFX9-GISEL-LABEL: test_vector_reduce_umin_v2i16:
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1326,7 +1327,7 @@ define i16 @test_vector_reduce_umin_v2i16(<2 x i16> %v) {
; GFX10-GISEL-LABEL: test_vector_reduce_umin_v2i16:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1347,7 +1348,7 @@ define i16 @test_vector_reduce_umin_v2i16(<2 x i16> %v) {
; GFX11-GISEL-LABEL: test_vector_reduce_umin_v2i16:
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1381,7 +1382,7 @@ define i16 @test_vector_reduce_umin_v2i16(<2 x i16> %v) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1527,6 +1528,7 @@ entry:
ret i16 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is regression. Need pattern to look through COPY.
define i16 @test_vector_reduce_umin_v4i16(<4 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_umin_v4i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -1583,7 +1585,7 @@ define i16 @test_vector_reduce_umin_v4i16(<4 x i16> %v) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX9-GISEL-NEXT: s_nop 0
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1599,7 +1601,7 @@ define i16 @test_vector_reduce_umin_v4i16(<4 x i16> %v) {
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1625,7 +1627,7 @@ define i16 @test_vector_reduce_umin_v4i16(<4 x i16> %v) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1663,7 +1665,7 @@ define i16 @test_vector_reduce_umin_v4i16(<4 x i16> %v) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -1671,6 +1673,7 @@ entry:
ret i16 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is regression. Need pattern to look through COPY.
define i16 @test_vector_reduce_umin_v8i16(<8 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_umin_v8i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -1756,7 +1759,7 @@ define i16 @test_vector_reduce_umin_v8i16(<8 x i16> %v) {
; GFX9-GISEL-NEXT: s_nop 0
; GFX9-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX9-GISEL-NEXT: s_nop 0
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1776,7 +1779,7 @@ define i16 @test_vector_reduce_umin_v8i16(<8 x i16> %v) {
; GFX10-GISEL-NEXT: v_pk_min_u16 v0, v0, v2
; GFX10-GISEL-NEXT: v_pk_min_u16 v1, v1, v3
; GFX10-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -1809,7 +1812,7 @@ define i16 @test_vector_reduce_umin_v8i16(<8 x i16> %v) {
; GFX11-GISEL-NEXT: v_pk_min_u16 v1, v1, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1855,7 +1858,7 @@ define i16 @test_vector_reduce_umin_v8i16(<8 x i16> %v) {
; GFX12-GISEL-NEXT: v_pk_min_u16 v1, v1, v3
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1864,6 +1867,7 @@ entry:
ret i16 %res
}
+; FIXME: With -new-reg-bank-select, v_alignbit_b32 is regression. Need pattern to look through COPY.
define i16 @test_vector_reduce_umin_v16i16(<16 x i16> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_umin_v16i16:
; GFX7-SDAG: ; %bb.0: ; %entry
@@ -2003,7 +2007,7 @@ define i16 @test_vector_reduce_umin_v16i16(<16 x i16> %v) {
; GFX9-GISEL-NEXT: s_nop 0
; GFX9-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX9-GISEL-NEXT: s_nop 0
-; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2031,7 +2035,7 @@ define i16 @test_vector_reduce_umin_v16i16(<16 x i16> %v) {
; GFX10-GISEL-NEXT: v_pk_min_u16 v0, v0, v2
; GFX10-GISEL-NEXT: v_pk_min_u16 v1, v1, v3
; GFX10-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
-; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -2079,7 +2083,7 @@ define i16 @test_vector_reduce_umin_v16i16(<16 x i16> %v) {
; GFX11-GISEL-NEXT: v_pk_min_u16 v1, v1, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
-; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2140,7 +2144,7 @@ define i16 @test_vector_reduce_umin_v16i16(<16 x i16> %v) {
; GFX12-GISEL-NEXT: v_pk_min_u16 v1, v1, v3
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_pk_min_u16 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/ARM/sincos.ll b/llvm/test/CodeGen/ARM/sincos.ll
index e1b683a..1a4313e 100644
--- a/llvm/test/CodeGen/ARM/sincos.ll
+++ b/llvm/test/CodeGen/ARM/sincos.ll
@@ -2,8 +2,7 @@
; RUN: llc < %s -mtriple=armv7-apple-ios7 -mcpu=cortex-a8 | FileCheck %s --check-prefix=SINCOS
; RUN: llc < %s -mtriple=armv7-linux-gnu -mcpu=cortex-a8 | FileCheck %s --check-prefix=SINCOS-GNU
; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 | FileCheck %s --check-prefix=SINCOS-GNU
-; RUN: llc < %s -mtriple=armv7-linux-android -mcpu=cortex-a8 | FileCheck %s --check-prefix=NOOPT-ANDROID
-; RUN: llc < %s -mtriple=armv7-linux-android9 -mcpu=cortex-a8 | FileCheck %s --check-prefix=SINCOS-GNU
+; RUN: llc < %s -mtriple=armv7-linux-android -mcpu=cortex-a8 | FileCheck %s --check-prefix=SINCOS-GNU
; Combine sin / cos into a single call unless they may write errno (as
; captured by readnone attrbiute, controlled by clang -fmath-errno
@@ -22,10 +21,6 @@ entry:
; NOOPT: bl _sinf
; NOOPT: bl _cosf
-; NOOPT-ANDROID-LABEL: test1:
-; NOOPT-ANDROID: bl sinf
-; NOOPT-ANDROID: bl cosf
-
%call = tail call float @sinf(float %x) readnone
%call1 = tail call float @cosf(float %x) readnone
%add = fadd float %call, %call1
@@ -44,10 +39,6 @@ entry:
; NOOPT: bl _sinf
; NOOPT: bl _cosf
-; NOOPT-ANDROID-LABEL: test1_fast:
-; NOOPT-ANDROID: bl sinf
-; NOOPT-ANDROID: bl cosf
-
%call = tail call fast float @sinf(float %x) readnone
%call1 = tail call fast float @cosf(float %x) readnone
%add = fadd float %call, %call1
@@ -68,10 +59,6 @@ entry:
; NOOPT: bl _sinf
; NOOPT: bl _cosf
-; NOOPT-ANDROID-LABEL: test1_errno:
-; NOOPT-ANDROID: bl sinf
-; NOOPT-ANDROID: bl cosf
-
%call = tail call float @sinf(float %x)
%call1 = tail call float @cosf(float %x)
%add = fadd float %call, %call1
@@ -90,10 +77,6 @@ entry:
; NOOPT: bl _sin
; NOOPT: bl _cos
-; NOOPT-ANDROID-LABEL: test2:
-; NOOPT-ANDROID: bl sin
-; NOOPT-ANDROID: bl cos
-
%call = tail call double @sin(double %x) readnone
%call1 = tail call double @cos(double %x) readnone
%add = fadd double %call, %call1
@@ -112,10 +95,6 @@ entry:
; NOOPT: bl _sin
; NOOPT: bl _cos
-; NOOPT-ANDROID-LABEL: test2_fast:
-; NOOPT-ANDROID: bl sin
-; NOOPT-ANDROID: bl cos
-
%call = tail call fast double @sin(double %x) readnone
%call1 = tail call fast double @cos(double %x) readnone
%add = fadd double %call, %call1
@@ -136,10 +115,6 @@ entry:
; NOOPT: bl _sin
; NOOPT: bl _cos
-; NOOPT-ANDROID-LABEL: test2_errno:
-; NOOPT-ANDROID: bl sin
-; NOOPT-ANDROID: bl cos
-
%call = tail call double @sin(double %x)
%call1 = tail call double @cos(double %x)
%add = fadd double %call, %call1
diff --git a/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll b/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll
index edec1d0..1957019 100644
--- a/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll
+++ b/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll
@@ -201,8 +201,9 @@ define signext i32 @test4(i32 signext %x, i32 signext %y, i32 signext %z) {
;
; RV32IXQCI-LABEL: test4:
; RV32IXQCI: # %bb.0:
-; RV32IXQCI-NEXT: li a0, 0
-; RV32IXQCI-NEXT: qc.lieqi a0, a2, 0, 3
+; RV32IXQCI-NEXT: mv a0, a2
+; RV32IXQCI-NEXT: li a1, 3
+; RV32IXQCI-NEXT: qc.selectieqi a0, 0, a1, 0
; RV32IXQCI-NEXT: ret
%c = icmp eq i32 %z, 0
%a = select i1 %c, i32 3, i32 0
diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll
index 1a7a72d..693a40d 100644
--- a/llvm/test/CodeGen/RISCV/features-info.ll
+++ b/llvm/test/CodeGen/RISCV/features-info.ll
@@ -142,6 +142,7 @@
; CHECK-NEXT: shvstvecd - 'Shvstvecd' (vstvec supports Direct mode).
; CHECK-NEXT: shxadd-load-fusion - Enable SH(1|2|3)ADD(.UW) + load macrofusion.
; CHECK-NEXT: sifive7 - SiFive 7-Series processors.
+; CHECK-NEXT: single-element-vec-fp64 - Certain vector FP64 operations produce a single result element per cycle.
; CHECK-NEXT: smaia - 'Smaia' (Advanced Interrupt Architecture Machine Level).
; CHECK-NEXT: smcdeleg - 'Smcdeleg' (Counter Delegation Machine Level).
; CHECK-NEXT: smcntrpmf - 'Smcntrpmf' (Cycle and Instret Privilege Mode Filtering).
diff --git a/llvm/test/CodeGen/RISCV/xqcicli.ll b/llvm/test/CodeGen/RISCV/xqcicli.ll
index 8d4caa1..cdb1947 100644
--- a/llvm/test/CodeGen/RISCV/xqcicli.ll
+++ b/llvm/test/CodeGen/RISCV/xqcicli.ll
@@ -23,7 +23,8 @@ define i32 @select_cc_example_eq(i32 %a, i32 %b, i32 %x, i32 %y) {
;
; RV32IXQCI-LABEL: select_cc_example_eq:
; RV32IXQCI: # %bb.0: # %entry
-; RV32IXQCI-NEXT: qc.lieq a0, a1, a2, 11
+; RV32IXQCI-NEXT: qc.selectine a1, a2, a0, 11
+; RV32IXQCI-NEXT: mv a0, a1
; RV32IXQCI-NEXT: ret
entry:
%cmp = icmp eq i32 %b, %x
@@ -47,7 +48,8 @@ define i32 @select_cc_example_ne(i32 %a, i32 %b, i32 %x, i32 %y) {
;
; RV32IXQCI-LABEL: select_cc_example_ne:
; RV32IXQCI: # %bb.0: # %entry
-; RV32IXQCI-NEXT: qc.line a0, a1, a2, 11
+; RV32IXQCI-NEXT: qc.selectieq a1, a2, a0, 11
+; RV32IXQCI-NEXT: mv a0, a1
; RV32IXQCI-NEXT: ret
entry:
%cmp = icmp ne i32 %b, %x
@@ -167,7 +169,8 @@ define i32 @select_cc_example_eq_c(i32 %a, i32 %b, i32 %x, i32 %y) {
;
; RV32IXQCI-LABEL: select_cc_example_eq_c:
; RV32IXQCI: # %bb.0: # %entry
-; RV32IXQCI-NEXT: qc.line a0, a1, a2, 11
+; RV32IXQCI-NEXT: qc.selectieq a1, a2, a0, 11
+; RV32IXQCI-NEXT: mv a0, a1
; RV32IXQCI-NEXT: ret
entry:
%cmp = icmp eq i32 %b, %x
@@ -191,7 +194,8 @@ define i32 @select_cc_example_ne_c(i32 %a, i32 %b, i32 %x, i32 %y) {
;
; RV32IXQCI-LABEL: select_cc_example_ne_c:
; RV32IXQCI: # %bb.0: # %entry
-; RV32IXQCI-NEXT: qc.lieq a0, a1, a2, 11
+; RV32IXQCI-NEXT: qc.selectine a1, a2, a0, 11
+; RV32IXQCI-NEXT: mv a0, a1
; RV32IXQCI-NEXT: ret
entry:
%cmp = icmp ne i32 %b, %x
@@ -312,7 +316,8 @@ define i32 @select_cc_example_eqi(i32 %a, i32 %b, i32 %x, i32 %y) {
;
; RV32IXQCI-LABEL: select_cc_example_eqi:
; RV32IXQCI: # %bb.0: # %entry
-; RV32IXQCI-NEXT: qc.lieqi a0, a1, 12, 11
+; RV32IXQCI-NEXT: qc.selectinei a1, 12, a0, 11
+; RV32IXQCI-NEXT: mv a0, a1
; RV32IXQCI-NEXT: ret
entry:
%cmp = icmp eq i32 %b, 12
@@ -337,7 +342,8 @@ define i32 @select_cc_example_nei(i32 %a, i32 %b, i32 %x, i32 %y) {
;
; RV32IXQCI-LABEL: select_cc_example_nei:
; RV32IXQCI: # %bb.0: # %entry
-; RV32IXQCI-NEXT: qc.linei a0, a1, 12, 11
+; RV32IXQCI-NEXT: qc.selectieqi a1, 12, a0, 11
+; RV32IXQCI-NEXT: mv a0, a1
; RV32IXQCI-NEXT: ret
entry:
%cmp = icmp ne i32 %b, 12
@@ -462,7 +468,8 @@ define i32 @select_cc_example_eqi_c1(i32 %a, i32 %b, i32 %x, i32 %y) {
;
; RV32IXQCI-LABEL: select_cc_example_eqi_c1:
; RV32IXQCI: # %bb.0: # %entry
-; RV32IXQCI-NEXT: qc.lieqi a0, a1, 12, 11
+; RV32IXQCI-NEXT: qc.selectinei a1, 12, a0, 11
+; RV32IXQCI-NEXT: mv a0, a1
; RV32IXQCI-NEXT: ret
entry:
%cmp = icmp eq i32 12, %b
@@ -487,7 +494,8 @@ define i32 @select_cc_example_nei_c1(i32 %a, i32 %b, i32 %x, i32 %y) {
;
; RV32IXQCI-LABEL: select_cc_example_nei_c1:
; RV32IXQCI: # %bb.0: # %entry
-; RV32IXQCI-NEXT: qc.linei a0, a1, 12, 11
+; RV32IXQCI-NEXT: qc.selectieqi a1, 12, a0, 11
+; RV32IXQCI-NEXT: mv a0, a1
; RV32IXQCI-NEXT: ret
entry:
%cmp = icmp ne i32 12, %b
@@ -612,7 +620,8 @@ define i32 @select_cc_example_eqi_c2(i32 %a, i32 %b, i32 %x, i32 %y) {
;
; RV32IXQCI-LABEL: select_cc_example_eqi_c2:
; RV32IXQCI: # %bb.0: # %entry
-; RV32IXQCI-NEXT: qc.linei a0, a1, 12, 11
+; RV32IXQCI-NEXT: qc.selectieqi a1, 12, a0, 11
+; RV32IXQCI-NEXT: mv a0, a1
; RV32IXQCI-NEXT: ret
entry:
%cmp = icmp eq i32 12, %b
@@ -637,7 +646,8 @@ define i32 @select_cc_example_nei_c2(i32 %a, i32 %b, i32 %x, i32 %y) {
;
; RV32IXQCI-LABEL: select_cc_example_nei_c2:
; RV32IXQCI: # %bb.0: # %entry
-; RV32IXQCI-NEXT: qc.lieqi a0, a1, 12, 11
+; RV32IXQCI-NEXT: qc.selectinei a1, 12, a0, 11
+; RV32IXQCI-NEXT: mv a0, a1
; RV32IXQCI-NEXT: ret
entry:
%cmp = icmp ne i32 12, %b
@@ -762,7 +772,8 @@ define i32 @select_cc_example_eqi_c3(i32 %a, i32 %b, i32 %x, i32 %y) {
;
; RV32IXQCI-LABEL: select_cc_example_eqi_c3:
; RV32IXQCI: # %bb.0: # %entry
-; RV32IXQCI-NEXT: qc.linei a0, a1, 12, 11
+; RV32IXQCI-NEXT: qc.selectieqi a1, 12, a0, 11
+; RV32IXQCI-NEXT: mv a0, a1
; RV32IXQCI-NEXT: ret
entry:
%cmp = icmp eq i32 %b, 12
@@ -787,7 +798,8 @@ define i32 @select_cc_example_nei_c3(i32 %a, i32 %b, i32 %x, i32 %y) {
;
; RV32IXQCI-LABEL: select_cc_example_nei_c3:
; RV32IXQCI: # %bb.0: # %entry
-; RV32IXQCI-NEXT: qc.lieqi a0, a1, 12, 11
+; RV32IXQCI-NEXT: qc.selectinei a1, 12, a0, 11
+; RV32IXQCI-NEXT: mv a0, a1
; RV32IXQCI-NEXT: ret
entry:
%cmp = icmp ne i32 %b, 12
diff --git a/llvm/test/CodeGen/RISCV/xqcics.ll b/llvm/test/CodeGen/RISCV/xqcics.ll
index c0839c9..7656a0c 100644
--- a/llvm/test/CodeGen/RISCV/xqcics.ll
+++ b/llvm/test/CodeGen/RISCV/xqcics.ll
@@ -270,8 +270,7 @@ define i32 @select_cc_example_eqi(i32 %a, i32 %b, i32 %x, i32 %y) {
;
; RV32IXQCI-LABEL: select_cc_example_eqi:
; RV32IXQCI: # %bb.0: # %entry
-; RV32IXQCI-NEXT: qc.line a2, a0, a1, 11
-; RV32IXQCI-NEXT: mv a0, a2
+; RV32IXQCI-NEXT: qc.selectieq a0, a1, a2, 11
; RV32IXQCI-NEXT: ret
entry:
%cmp = icmp eq i32 %a, %b
@@ -301,8 +300,7 @@ define i32 @select_cc_example_eqi_c(i32 %a, i32 %b, i32 %x, i32 %y) {
;
; RV32IXQCI-LABEL: select_cc_example_eqi_c:
; RV32IXQCI: # %bb.0: # %entry
-; RV32IXQCI-NEXT: qc.lieq a2, a0, a1, 11
-; RV32IXQCI-NEXT: mv a0, a2
+; RV32IXQCI-NEXT: qc.selectine a0, a1, a2, 11
; RV32IXQCI-NEXT: ret
entry:
%cmp = icmp eq i32 %a, %b
@@ -332,8 +330,7 @@ define i32 @select_cc_example_nei(i32 %a, i32 %b, i32 %x, i32 %y) {
;
; RV32IXQCI-LABEL: select_cc_example_nei:
; RV32IXQCI: # %bb.0: # %entry
-; RV32IXQCI-NEXT: qc.lieq a2, a0, a1, 11
-; RV32IXQCI-NEXT: mv a0, a2
+; RV32IXQCI-NEXT: qc.selectine a0, a1, a2, 11
; RV32IXQCI-NEXT: ret
entry:
%cmp = icmp ne i32 %a, %b
@@ -363,8 +360,7 @@ define i32 @select_cc_example_nei_c(i32 %a, i32 %b, i32 %x, i32 %y) {
;
; RV32IXQCI-LABEL: select_cc_example_nei_c:
; RV32IXQCI: # %bb.0: # %entry
-; RV32IXQCI-NEXT: qc.line a2, a0, a1, 11
-; RV32IXQCI-NEXT: mv a0, a2
+; RV32IXQCI-NEXT: qc.selectieq a0, a1, a2, 11
; RV32IXQCI-NEXT: ret
entry:
%cmp = icmp ne i32 %a, %b
@@ -395,8 +391,7 @@ define i32 @select_cc_example_ieqi(i32 %a, i32 %b, i32 %x, i32 %y) {
;
; RV32IXQCI-LABEL: select_cc_example_ieqi:
; RV32IXQCI: # %bb.0: # %entry
-; RV32IXQCI-NEXT: qc.linei a2, a0, 12, 11
-; RV32IXQCI-NEXT: mv a0, a2
+; RV32IXQCI-NEXT: qc.selectieqi a0, 12, a2, 11
; RV32IXQCI-NEXT: ret
entry:
%cmp = icmp eq i32 %a, 12
@@ -427,8 +422,7 @@ define i32 @select_cc_example_ieqi_c1(i32 %a, i32 %b, i32 %x, i32 %y) {
;
; RV32IXQCI-LABEL: select_cc_example_ieqi_c1:
; RV32IXQCI: # %bb.0: # %entry
-; RV32IXQCI-NEXT: qc.linei a2, a0, 12, 11
-; RV32IXQCI-NEXT: mv a0, a2
+; RV32IXQCI-NEXT: qc.selectieqi a0, 12, a2, 11
; RV32IXQCI-NEXT: ret
entry:
%cmp = icmp eq i32 12, %a
@@ -459,8 +453,7 @@ define i32 @select_cc_example_ieqi_c2(i32 %a, i32 %b, i32 %x, i32 %y) {
;
; RV32IXQCI-LABEL: select_cc_example_ieqi_c2:
; RV32IXQCI: # %bb.0: # %entry
-; RV32IXQCI-NEXT: qc.lieqi a2, a0, 12, 11
-; RV32IXQCI-NEXT: mv a0, a2
+; RV32IXQCI-NEXT: qc.selectinei a0, 12, a2, 11
; RV32IXQCI-NEXT: ret
entry:
%cmp = icmp eq i32 %a, 12
@@ -491,8 +484,7 @@ define i32 @select_cc_example_ieqi_c3(i32 %a, i32 %b, i32 %x, i32 %y) {
;
; RV32IXQCI-LABEL: select_cc_example_ieqi_c3:
; RV32IXQCI: # %bb.0: # %entry
-; RV32IXQCI-NEXT: qc.lieqi a2, a0, 12, 11
-; RV32IXQCI-NEXT: mv a0, a2
+; RV32IXQCI-NEXT: qc.selectinei a0, 12, a2, 11
; RV32IXQCI-NEXT: ret
entry:
%cmp = icmp eq i32 12, %a
@@ -523,8 +515,7 @@ define i32 @select_cc_example_inei(i32 %a, i32 %b, i32 %x, i32 %y) {
;
; RV32IXQCI-LABEL: select_cc_example_inei:
; RV32IXQCI: # %bb.0: # %entry
-; RV32IXQCI-NEXT: qc.lieqi a2, a0, 12, 11
-; RV32IXQCI-NEXT: mv a0, a2
+; RV32IXQCI-NEXT: qc.selectinei a0, 12, a2, 11
; RV32IXQCI-NEXT: ret
entry:
%cmp = icmp ne i32 %a, 12
@@ -555,8 +546,7 @@ define i32 @select_cc_example_inei_c1(i32 %a, i32 %b, i32 %x, i32 %y) {
;
; RV32IXQCI-LABEL: select_cc_example_inei_c1:
; RV32IXQCI: # %bb.0: # %entry
-; RV32IXQCI-NEXT: qc.lieqi a2, a0, 12, 11
-; RV32IXQCI-NEXT: mv a0, a2
+; RV32IXQCI-NEXT: qc.selectinei a0, 12, a2, 11
; RV32IXQCI-NEXT: ret
entry:
%cmp = icmp ne i32 12, %a
@@ -587,8 +577,7 @@ define i32 @select_cc_example_inei_c2(i32 %a, i32 %b, i32 %x, i32 %y) {
;
; RV32IXQCI-LABEL: select_cc_example_inei_c2:
; RV32IXQCI: # %bb.0: # %entry
-; RV32IXQCI-NEXT: qc.linei a2, a0, 12, 11
-; RV32IXQCI-NEXT: mv a0, a2
+; RV32IXQCI-NEXT: qc.selectieqi a0, 12, a2, 11
; RV32IXQCI-NEXT: ret
entry:
%cmp = icmp ne i32 %a, 12
@@ -619,8 +608,7 @@ define i32 @select_cc_example_inei_c3(i32 %a, i32 %b, i32 %x, i32 %y) {
;
; RV32IXQCI-LABEL: select_cc_example_inei_c3:
; RV32IXQCI: # %bb.0: # %entry
-; RV32IXQCI-NEXT: qc.linei a2, a0, 12, 11
-; RV32IXQCI-NEXT: mv a0, a2
+; RV32IXQCI-NEXT: qc.selectieqi a0, 12, a2, 11
; RV32IXQCI-NEXT: ret
entry:
%cmp = icmp ne i32 12, %a
@@ -712,8 +700,7 @@ define i32 @select_cc_example_eq1(i32 %a, i32 %b, i32 %x, i32 %y) {
;
; RV32IXQCI-LABEL: select_cc_example_eq1:
; RV32IXQCI: # %bb.0: # %entry
-; RV32IXQCI-NEXT: qc.line a2, a1, a0, 11
-; RV32IXQCI-NEXT: mv a0, a2
+; RV32IXQCI-NEXT: qc.selectieq a0, a1, a2, 11
; RV32IXQCI-NEXT: ret
entry:
%cmp = icmp eq i32 %b, %a
@@ -743,8 +730,7 @@ define i32 @select_cc_example_ne1(i32 %a, i32 %b, i32 %x, i32 %y) {
;
; RV32IXQCI-LABEL: select_cc_example_ne1:
; RV32IXQCI: # %bb.0: # %entry
-; RV32IXQCI-NEXT: qc.lieq a2, a1, a0, 11
-; RV32IXQCI-NEXT: mv a0, a2
+; RV32IXQCI-NEXT: qc.selectine a0, a1, a2, 11
; RV32IXQCI-NEXT: ret
entry:
%cmp = icmp ne i32 %b, %a
diff --git a/llvm/test/CodeGen/X86/fast-isel-fneg.ll b/llvm/test/CodeGen/X86/fast-isel-fneg.ll
deleted file mode 100644
index 128f5ee..0000000
--- a/llvm/test/CodeGen/X86/fast-isel-fneg.ll
+++ /dev/null
@@ -1,101 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -fast-isel -fast-isel-abort=3 -mtriple=x86_64-apple-darwin10 | FileCheck %s
-; RUN: llc < %s -fast-isel -mtriple=i686-- -mattr=+sse2 | FileCheck --check-prefix=SSE2 %s
-
-define double @fneg_f64(double %x) nounwind {
-; CHECK-LABEL: fneg_f64:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: movq %xmm0, %rax
-; CHECK-NEXT: movabsq $-9223372036854775808, %rcx ## imm = 0x8000000000000000
-; CHECK-NEXT: xorq %rax, %rcx
-; CHECK-NEXT: movq %rcx, %xmm0
-; CHECK-NEXT: retq
-;
-; SSE2-LABEL: fneg_f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pushl %ebp
-; SSE2-NEXT: movl %esp, %ebp
-; SSE2-NEXT: andl $-8, %esp
-; SSE2-NEXT: subl $8, %esp
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; SSE2-NEXT: movlps %xmm0, (%esp)
-; SSE2-NEXT: fldl (%esp)
-; SSE2-NEXT: movl %ebp, %esp
-; SSE2-NEXT: popl %ebp
-; SSE2-NEXT: retl
- %y = fneg double %x
- ret double %y
-}
-
-define float @fneg_f32(float %x) nounwind {
-; CHECK-LABEL: fneg_f32:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: movd %xmm0, %eax
-; CHECK-NEXT: xorl $2147483648, %eax ## imm = 0x80000000
-; CHECK-NEXT: movd %eax, %xmm0
-; CHECK-NEXT: retq
-;
-; SSE2-LABEL: fneg_f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pushl %eax
-; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; SSE2-NEXT: movss %xmm0, (%esp)
-; SSE2-NEXT: flds (%esp)
-; SSE2-NEXT: popl %eax
-; SSE2-NEXT: retl
- %y = fneg float %x
- ret float %y
-}
-
-define void @fneg_f64_mem(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: fneg_f64_mem:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: movq %xmm0, %rax
-; CHECK-NEXT: movabsq $-9223372036854775808, %rcx ## imm = 0x8000000000000000
-; CHECK-NEXT: xorq %rax, %rcx
-; CHECK-NEXT: movq %rcx, %xmm0
-; CHECK-NEXT: movq %xmm0, (%rsi)
-; CHECK-NEXT: retq
-;
-; SSE2-LABEL: fneg_f64_mem:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; SSE2-NEXT: movsd %xmm0, (%eax)
-; SSE2-NEXT: retl
- %a = load double, ptr %x
- %b = fneg double %a
- store double %b, ptr %y
- ret void
-}
-
-define void @fneg_f32_mem(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: fneg_f32_mem:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: movd %xmm0, %eax
-; CHECK-NEXT: xorl $2147483648, %eax ## imm = 0x80000000
-; CHECK-NEXT: movd %eax, %xmm0
-; CHECK-NEXT: movd %xmm0, (%rsi)
-; CHECK-NEXT: retq
-;
-; SSE2-LABEL: fneg_f32_mem:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd %xmm0, %ecx
-; SSE2-NEXT: xorl $2147483648, %ecx # imm = 0x80000000
-; SSE2-NEXT: movd %ecx, %xmm0
-; SSE2-NEXT: movd %xmm0, (%eax)
-; SSE2-NEXT: retl
- %a = load float, ptr %x
- %b = fneg float %a
- store float %b, ptr %y
- ret void
-}
diff --git a/llvm/test/CodeGen/X86/fp-int-fp-cvt.ll b/llvm/test/CodeGen/X86/fp-int-fp-cvt.ll
new file mode 100644
index 0000000..b6c17ce
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fp-int-fp-cvt.ll
@@ -0,0 +1,240 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512,AVX512-VL
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=x86-64-v4 -mattr=-avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512-NOVL
+
+;
+; fptosi -> sitofp
+;
+
+define double @scvtf64_i32(double %a0) {
+; SSE-LABEL: scvtf64_i32:
+; SSE: # %bb.0:
+; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
+; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: scvtf64_i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ii = fptosi double %a0 to i32
+ %ff = sitofp i32 %ii to double
+ ret double %ff
+}
+
+define double @scvtf64_i64(double %a0) {
+; SSE-LABEL: scvtf64_i64:
+; SSE: # %bb.0:
+; SSE-NEXT: cvttsd2si %xmm0, %rax
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: cvtsi2sd %rax, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: scvtf64_i64:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvttsd2si %xmm0, %rax
+; AVX-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0
+; AVX-NEXT: retq
+ %ii = fptosi double %a0 to i64
+ %ff = sitofp i64 %ii to double
+ ret double %ff
+}
+
+define float @scvtf32_i32(float %a0) {
+; SSE-LABEL: scvtf32_i32:
+; SSE: # %bb.0:
+; SSE-NEXT: cvttps2dq %xmm0, %xmm0
+; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: scvtf32_i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ii = fptosi float %a0 to i32
+ %ff = sitofp i32 %ii to float
+ ret float %ff
+}
+
+define float @scvtf32_i64(float %a0) {
+; SSE-LABEL: scvtf32_i64:
+; SSE: # %bb.0:
+; SSE-NEXT: cvttss2si %xmm0, %rax
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: cvtsi2ss %rax, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: scvtf32_i64:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvttss2si %xmm0, %rax
+; AVX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0
+; AVX-NEXT: retq
+ %ii = fptosi float %a0 to i64
+ %ff = sitofp i64 %ii to float
+ ret float %ff
+}
+
+;
+; fptoui -> uitofp
+;
+
+define double @ucvtf64_i32(double %a0) {
+; SSE-LABEL: ucvtf64_i32:
+; SSE: # %bb.0:
+; SSE-NEXT: cvttsd2si %xmm0, %rax
+; SSE-NEXT: movl %eax, %eax
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: cvtsi2sd %rax, %xmm0
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: ucvtf64_i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vcvttsd2si %xmm0, %rax
+; AVX2-NEXT: movl %eax, %eax
+; AVX2-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ucvtf64_i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcvttsd2usi %xmm0, %eax
+; AVX512-NEXT: vcvtusi2sd %eax, %xmm15, %xmm0
+; AVX512-NEXT: retq
+ %ii = fptoui double %a0 to i32
+ %ff = uitofp i32 %ii to double
+ ret double %ff
+}
+
+define double @ucvtf64_i64(double %a0) {
+; SSE-LABEL: ucvtf64_i64:
+; SSE: # %bb.0:
+; SSE-NEXT: cvttsd2si %xmm0, %rax
+; SSE-NEXT: movq %rax, %rcx
+; SSE-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: cvttsd2si %xmm0, %rdx
+; SSE-NEXT: sarq $63, %rcx
+; SSE-NEXT: andq %rcx, %rdx
+; SSE-NEXT: orq %rax, %rdx
+; SSE-NEXT: movq %rdx, %xmm1
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT: addsd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: ucvtf64_i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vcvttsd2si %xmm0, %rax
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: vsubsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: sarq $63, %rcx
+; AVX2-NEXT: vcvttsd2si %xmm0, %rdx
+; AVX2-NEXT: andq %rcx, %rdx
+; AVX2-NEXT: orq %rax, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ucvtf64_i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcvttsd2usi %xmm0, %rax
+; AVX512-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0
+; AVX512-NEXT: retq
+ %ii = fptoui double %a0 to i64
+ %ff = uitofp i64 %ii to double
+ ret double %ff
+}
+
+define float @ucvtf32_i32(float %a0) {
+; SSE-LABEL: ucvtf32_i32:
+; SSE: # %bb.0:
+; SSE-NEXT: cvttss2si %xmm0, %rax
+; SSE-NEXT: movl %eax, %eax
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: cvtsi2ss %rax, %xmm0
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: ucvtf32_i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vcvttss2si %xmm0, %rax
+; AVX2-NEXT: movl %eax, %eax
+; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ucvtf32_i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcvttss2usi %xmm0, %eax
+; AVX512-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0
+; AVX512-NEXT: retq
+ %ii = fptoui float %a0 to i32
+ %ff = uitofp i32 %ii to float
+ ret float %ff
+}
+
+define float @ucvtf32_i64(float %a0) {
+; SSE-LABEL: ucvtf32_i64:
+; SSE: # %bb.0:
+; SSE-NEXT: cvttss2si %xmm0, %rcx
+; SSE-NEXT: movq %rcx, %rdx
+; SSE-NEXT: sarq $63, %rdx
+; SSE-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: cvttss2si %xmm0, %rax
+; SSE-NEXT: andq %rdx, %rax
+; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: js .LBB7_1
+; SSE-NEXT: # %bb.2:
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: cvtsi2ss %rax, %xmm0
+; SSE-NEXT: retq
+; SSE-NEXT: .LBB7_1:
+; SSE-NEXT: movq %rax, %rcx
+; SSE-NEXT: shrq %rcx
+; SSE-NEXT: andl $1, %eax
+; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: cvtsi2ss %rax, %xmm0
+; SSE-NEXT: addss %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: ucvtf32_i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vcvttss2si %xmm0, %rcx
+; AVX2-NEXT: movq %rcx, %rdx
+; AVX2-NEXT: sarq $63, %rdx
+; AVX2-NEXT: vsubss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vcvttss2si %xmm0, %rax
+; AVX2-NEXT: andq %rdx, %rax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: js .LBB7_1
+; AVX2-NEXT: # %bb.2:
+; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0
+; AVX2-NEXT: retq
+; AVX2-NEXT: .LBB7_1:
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq %rcx
+; AVX2-NEXT: andl $1, %eax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0
+; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ucvtf32_i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcvttss2usi %xmm0, %rax
+; AVX512-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0
+; AVX512-NEXT: retq
+ %ii = fptoui float %a0 to i64
+ %ff = uitofp i64 %ii to float
+ ret float %ff
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX512-NOVL: {{.*}}
+; AVX512-VL: {{.*}}
diff --git a/llvm/test/CodeGen/X86/isel-fneg.ll b/llvm/test/CodeGen/X86/isel-fneg.ll
new file mode 100644
index 0000000..77b3f26
--- /dev/null
+++ b/llvm/test/CodeGen/X86/isel-fneg.ll
@@ -0,0 +1,208 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X86,FASTISEL-X86
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=0 -fast-isel=0 | FileCheck %s --check-prefixes=X86,SDAG-X86
+; DISABLED: llc < %s -mtriple=i686-linux-gnu -global-isel=1 -global-isel-abort=2 | FileCheck %s --check-prefixes=X86,GISEL-X86
+; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel -mattr=+sse | FileCheck %s --check-prefixes=X86,SSE-X86,FASTISEL-SSE-X86
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=0 -fast-isel=0 -mattr=+sse | FileCheck %s --check-prefixes=X86,SSE-X86,SDAG-SSE-X86
+; DISABLED: llc < %s -mtriple=i686-linux-gnu -global-isel=1 -global-isel-abort=2 -mattr=+sse | FileCheck %s --check-prefixes=X86,SSE-X86,GISEL-SSE-X86
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel -mattr=+sse | FileCheck %s --check-prefixes=X64,SSE-X64,FASTISEL-SSE-X64
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel=0 -fast-isel=0 -mattr=+sse | FileCheck %s --check-prefixes=X64,SSE-X64,SDAG-SSE-X64
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel=1 -global-isel-abort=2 -mattr=+sse | FileCheck %s --check-prefixes=X64,SSE-X64,GISEL-SSE-X64
+
+define double @fneg_f64(double %x) nounwind {
+; X86-LABEL: fneg_f64:
+; X86: # %bb.0:
+; X86-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-NEXT: fchs
+; X86-NEXT: retl
+;
+; FASTISEL-SSE-X64-LABEL: fneg_f64:
+; FASTISEL-SSE-X64: # %bb.0:
+; FASTISEL-SSE-X64-NEXT: movq %xmm0, %rax
+; FASTISEL-SSE-X64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; FASTISEL-SSE-X64-NEXT: xorq %rax, %rcx
+; FASTISEL-SSE-X64-NEXT: movq %rcx, %xmm0
+; FASTISEL-SSE-X64-NEXT: retq
+;
+; SDAG-SSE-X64-LABEL: fneg_f64:
+; SDAG-SSE-X64: # %bb.0:
+; SDAG-SSE-X64-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SDAG-SSE-X64-NEXT: retq
+;
+; GISEL-SSE-X64-LABEL: fneg_f64:
+; GISEL-SSE-X64: # %bb.0:
+; GISEL-SSE-X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; GISEL-SSE-X64-NEXT: movq %xmm0, %rcx
+; GISEL-SSE-X64-NEXT: xorq %rax, %rcx
+; GISEL-SSE-X64-NEXT: movq %rcx, %xmm0
+; GISEL-SSE-X64-NEXT: retq
+ %y = fneg double %x
+ ret double %y
+}
+
+define float @fneg_f32(float %x) nounwind {
+; FASTISEL-X86-LABEL: fneg_f32:
+; FASTISEL-X86: # %bb.0:
+; FASTISEL-X86-NEXT: flds {{[0-9]+}}(%esp)
+; FASTISEL-X86-NEXT: fchs
+; FASTISEL-X86-NEXT: retl
+;
+; SDAG-X86-LABEL: fneg_f32:
+; SDAG-X86: # %bb.0:
+; SDAG-X86-NEXT: flds {{[0-9]+}}(%esp)
+; SDAG-X86-NEXT: fchs
+; SDAG-X86-NEXT: retl
+;
+; SSE-X86-LABEL: fneg_f32:
+; SSE-X86: # %bb.0:
+; SSE-X86-NEXT: pushl %eax
+; SSE-X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-X86-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; SSE-X86-NEXT: movss %xmm0, (%esp)
+; SSE-X86-NEXT: flds (%esp)
+; SSE-X86-NEXT: popl %eax
+; SSE-X86-NEXT: retl
+;
+; FASTISEL-SSE-X64-LABEL: fneg_f32:
+; FASTISEL-SSE-X64: # %bb.0:
+; FASTISEL-SSE-X64-NEXT: movd %xmm0, %eax
+; FASTISEL-SSE-X64-NEXT: xorl $2147483648, %eax # imm = 0x80000000
+; FASTISEL-SSE-X64-NEXT: movd %eax, %xmm0
+; FASTISEL-SSE-X64-NEXT: retq
+;
+; SDAG-SSE-X64-LABEL: fneg_f32:
+; SDAG-SSE-X64: # %bb.0:
+; SDAG-SSE-X64-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SDAG-SSE-X64-NEXT: retq
+;
+; GISEL-SSE-X64-LABEL: fneg_f32:
+; GISEL-SSE-X64: # %bb.0:
+; GISEL-SSE-X64-NEXT: movd %xmm0, %eax
+; GISEL-SSE-X64-NEXT: addl $-2147483648, %eax # imm = 0x80000000
+; GISEL-SSE-X64-NEXT: movd %eax, %xmm0
+; GISEL-SSE-X64-NEXT: retq
+ %y = fneg float %x
+ ret float %y
+}
+
+define void @fneg_f64_mem(ptr %x, ptr %y) nounwind {
+; X86-LABEL: fneg_f64_mem:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: fldl (%ecx)
+; X86-NEXT: fchs
+; X86-NEXT: fstpl (%eax)
+; X86-NEXT: retl
+;
+; FASTISEL-SSE-X64-LABEL: fneg_f64_mem:
+; FASTISEL-SSE-X64: # %bb.0:
+; FASTISEL-SSE-X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; FASTISEL-SSE-X64-NEXT: movq %xmm0, %rax
+; FASTISEL-SSE-X64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; FASTISEL-SSE-X64-NEXT: xorq %rax, %rcx
+; FASTISEL-SSE-X64-NEXT: movq %rcx, %xmm0
+; FASTISEL-SSE-X64-NEXT: movq %xmm0, (%rsi)
+; FASTISEL-SSE-X64-NEXT: retq
+;
+; SDAG-SSE-X64-LABEL: fneg_f64_mem:
+; SDAG-SSE-X64: # %bb.0:
+; SDAG-SSE-X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; SDAG-SSE-X64-NEXT: xorq (%rdi), %rax
+; SDAG-SSE-X64-NEXT: movq %rax, (%rsi)
+; SDAG-SSE-X64-NEXT: retq
+;
+; GISEL-SSE-X64-LABEL: fneg_f64_mem:
+; GISEL-SSE-X64: # %bb.0:
+; GISEL-SSE-X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; GISEL-SSE-X64-NEXT: xorq (%rdi), %rax
+; GISEL-SSE-X64-NEXT: movq %rax, (%rsi)
+; GISEL-SSE-X64-NEXT: retq
+ %a = load double, ptr %x
+ %b = fneg double %a
+ store double %b, ptr %y
+ ret void
+}
+
+define void @fneg_f32_mem(ptr %x, ptr %y) nounwind {
+; FASTISEL-X86-LABEL: fneg_f32_mem:
+; FASTISEL-X86: # %bb.0:
+; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FASTISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FASTISEL-X86-NEXT: movl $-2147483648, %edx # imm = 0x80000000
+; FASTISEL-X86-NEXT: xorl (%ecx), %edx
+; FASTISEL-X86-NEXT: movl %edx, (%eax)
+; FASTISEL-X86-NEXT: retl
+;
+; SDAG-X86-LABEL: fneg_f32_mem:
+; SDAG-X86: # %bb.0:
+; SDAG-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SDAG-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; SDAG-X86-NEXT: movl $-2147483648, %edx # imm = 0x80000000
+; SDAG-X86-NEXT: xorl (%ecx), %edx
+; SDAG-X86-NEXT: movl %edx, (%eax)
+; SDAG-X86-NEXT: retl
+;
+; FASTISEL-SSE-X86-LABEL: fneg_f32_mem:
+; FASTISEL-SSE-X86: # %bb.0:
+; FASTISEL-SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FASTISEL-SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FASTISEL-SSE-X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; FASTISEL-SSE-X86-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; FASTISEL-SSE-X86-NEXT: movss %xmm0, (%eax)
+; FASTISEL-SSE-X86-NEXT: retl
+;
+; SDAG-SSE-X86-LABEL: fneg_f32_mem:
+; SDAG-SSE-X86: # %bb.0:
+; SDAG-SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SDAG-SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; SDAG-SSE-X86-NEXT: movl $-2147483648, %edx # imm = 0x80000000
+; SDAG-SSE-X86-NEXT: xorl (%ecx), %edx
+; SDAG-SSE-X86-NEXT: movl %edx, (%eax)
+; SDAG-SSE-X86-NEXT: retl
+;
+; FASTISEL-SSE-X64-LABEL: fneg_f32_mem:
+; FASTISEL-SSE-X64: # %bb.0:
+; FASTISEL-SSE-X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; FASTISEL-SSE-X64-NEXT: movd %xmm0, %eax
+; FASTISEL-SSE-X64-NEXT: xorl $2147483648, %eax # imm = 0x80000000
+; FASTISEL-SSE-X64-NEXT: movd %eax, %xmm0
+; FASTISEL-SSE-X64-NEXT: movd %xmm0, (%rsi)
+; FASTISEL-SSE-X64-NEXT: retq
+;
+; SDAG-SSE-X64-LABEL: fneg_f32_mem:
+; SDAG-SSE-X64: # %bb.0:
+; SDAG-SSE-X64-NEXT: movl $-2147483648, %eax # imm = 0x80000000
+; SDAG-SSE-X64-NEXT: xorl (%rdi), %eax
+; SDAG-SSE-X64-NEXT: movl %eax, (%rsi)
+; SDAG-SSE-X64-NEXT: retq
+;
+; GISEL-SSE-X64-LABEL: fneg_f32_mem:
+; GISEL-SSE-X64: # %bb.0:
+; GISEL-SSE-X64-NEXT: movl $-2147483648, %eax # imm = 0x80000000
+; GISEL-SSE-X64-NEXT: xorl (%rdi), %eax
+; GISEL-SSE-X64-NEXT: movl %eax, (%rsi)
+; GISEL-SSE-X64-NEXT: retq
+ %a = load float, ptr %x
+ %b = fneg float %a
+ store float %b, ptr %y
+ ret void
+}
+
+define x86_fp80 @test_fp80(x86_fp80 %a) nounwind {
+; X86-LABEL: test_fp80:
+; X86: # %bb.0:
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
+; X86-NEXT: fchs
+; X86-NEXT: retl
+;
+; X64-LABEL: test_fp80:
+; X64: # %bb.0:
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fchs
+; X64-NEXT: retq
+ %1 = fneg x86_fp80 %a
+ ret x86_fp80 %1
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SSE-X64: {{.*}}
diff --git a/llvm/test/CodeGen/X86/stack-protector-target.ll b/llvm/test/CodeGen/X86/stack-protector-target.ll
index f7c5680..4ba0302 100644
--- a/llvm/test/CodeGen/X86/stack-protector-target.ll
+++ b/llvm/test/CodeGen/X86/stack-protector-target.ll
@@ -2,13 +2,8 @@
; RUN: llc -mtriple=i386-linux < %s -o - | FileCheck --check-prefix=I386-TLS %s
; RUN: llc -mtriple=x86_64-linux < %s -o - | FileCheck --check-prefix=X64-TLS %s
-; RUN: llc -mtriple=i386-linux-android < %s -o - | FileCheck --check-prefix=I386 %s
-; RUN: llc -mtriple=i386-linux-android16 < %s -o - | FileCheck --check-prefix=I386 %s
-; RUN: llc -mtriple=i386-linux-android17 < %s -o - | FileCheck --check-prefix=I386-TLS %s
-; RUN: llc -mtriple=i386-linux-android24 < %s -o - | FileCheck --check-prefix=I386-TLS %s
+; RUN: llc -mtriple=i386-linux-android < %s -o - | FileCheck --check-prefix=I386-TLS %s
; RUN: llc -mtriple=x86_64-linux-android < %s -o - | FileCheck --check-prefix=X64-TLS %s
-; RUN: llc -mtriple=x86_64-linux-android17 < %s -o - | FileCheck --check-prefix=X64-TLS %s
-; RUN: llc -mtriple=x86_64-linux-android24 < %s -o - | FileCheck --check-prefix=X64-TLS %s
; RUN: llc -mtriple=i386-kfreebsd < %s -o - | FileCheck --check-prefix=I386-TLS %s
; RUN: llc -mtriple=x86_64-kfreebsd < %s -o - | FileCheck --check-prefix=X64-TLS %s
@@ -27,11 +22,6 @@ declare void @_Z7CapturePi(ptr)
; X64-TLS: movq %fs:40, %[[C:.*]]
; X64-TLS: cmpq 16(%rsp), %[[C]]
-; I386: movl __stack_chk_guard, %[[B:.*]]
-; I386: movl %[[B]], 8(%esp)
-; I386: movl __stack_chk_guard, %[[C:.*]]
-; I386: cmpl 8(%esp), %[[C]]
-
; I386-TLS: movl %gs:20, %[[B:.*]]
; I386-TLS: movl %[[B]], 8(%esp)
; I386-TLS: movl %gs:20, %[[C:.*]]
diff --git a/llvm/test/DebugInfo/X86/DW_OP_LLVM_extract_bits.ll b/llvm/test/DebugInfo/X86/DW_OP_LLVM_extract_bits.ll
index 8342d42..5928127 100644
--- a/llvm/test/DebugInfo/X86/DW_OP_LLVM_extract_bits.ll
+++ b/llvm/test/DebugInfo/X86/DW_OP_LLVM_extract_bits.ll
@@ -67,6 +67,8 @@ entry:
; CHECK: DW_TAG_variable
; CHECK: DW_AT_location (DW_OP_fbreg -4, DW_OP_plus_uconst 0x3, DW_OP_deref_size 0x1, DW_OP_constu 0x38, DW_OP_shl, DW_OP_constu 0x39, DW_OP_shr, DW_OP_stack_value)
; CHECK: DW_AT_name ("z")
+; CHECK: DW_AT_location (DW_OP_fbreg -4, DW_OP_deref_size 0x8, DW_OP_stack_value)
+; CHECK: DW_AT_name ("q")
define i32 @test4() !dbg !28 {
entry:
@@ -74,6 +76,7 @@ entry:
tail call void @llvm.dbg.declare(metadata ptr %0, metadata !29, metadata !DIExpression(DW_OP_LLVM_extract_bits_zext, 31, 1)), !dbg !30
tail call void @llvm.dbg.declare(metadata ptr %0, metadata !31, metadata !DIExpression(DW_OP_LLVM_extract_bits_sext, 1, 31)), !dbg !30
tail call void @llvm.dbg.declare(metadata ptr %0, metadata !32, metadata !DIExpression(DW_OP_plus_uconst, 3, DW_OP_LLVM_extract_bits_zext, 1, 7)), !dbg !30
+ tail call void @llvm.dbg.declare(metadata ptr %0, metadata !33, metadata !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 64)), !dbg !30
ret i32 0, !dbg !30
}
@@ -116,3 +119,5 @@ declare void @llvm.dbg.value(metadata, metadata, metadata)
!30 = !DILocation(line: 0, scope: !28)
!31 = !DILocalVariable(name: "y", scope: !28, file: !3, type: !19)
!32 = !DILocalVariable(name: "z", scope: !28, file: !3, type: !9)
+!33 = !DILocalVariable(name: "q", scope: !28, file: !3, type: !34)
+!34 = !DIBasicType(name: "uint64_t", size: 64, encoding: DW_ATE_unsigned)
diff --git a/llvm/test/Instrumentation/AddressSanitizer/with-ifunc.ll b/llvm/test/Instrumentation/AddressSanitizer/with-ifunc.ll
index f198fc0..659d6aa 100644
--- a/llvm/test/Instrumentation/AddressSanitizer/with-ifunc.ll
+++ b/llvm/test/Instrumentation/AddressSanitizer/with-ifunc.ll
@@ -4,13 +4,10 @@
; RUN: opt -passes=asan -S -asan-with-ifunc=1 -asan-with-ifunc-suppress-remat=0 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-IFUNC
; RUN: opt -passes=asan -S -asan-with-ifunc=1 -asan-with-ifunc-suppress-remat=1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-IFUNC-NOREMAT
-; Pre-Lollipop Android does not support ifunc.
-; RUN: opt -passes=asan -S -asan-with-ifunc=1 -asan-with-ifunc-suppress-remat=0 -mtriple=armv7-linux-android20 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOIFUNC
-; RUN: opt -passes=asan -S -asan-with-ifunc=1 -asan-with-ifunc-suppress-remat=0 -mtriple=armv7-linux-android < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOIFUNC
-; RUN: opt -passes=asan -S -asan-with-ifunc=1 -asan-with-ifunc-suppress-remat=0 -mtriple=armv7-linux-android21 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-IFUNC
+; RUN: opt -passes=asan -S -asan-with-ifunc=1 -asan-with-ifunc-suppress-remat=0 -mtriple=armv7-linux-android < %s | FileCheck %s --check-prefixes=CHECK,CHECK-IFUNC
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
-target triple = "armv7--linux-android22"
+target triple = "armv7--linux-android"
; CHECK-IFUNC: @__asan_shadow = external global [0 x i8]
; CHECK-NOIFUNC: @__asan_shadow_memory_dynamic_address = external global i32
diff --git a/llvm/test/MC/Mips/branch-pseudos-bad.s b/llvm/test/MC/Mips/branch-pseudos-bad.s
index c23164d..9633414 100644
--- a/llvm/test/MC/Mips/branch-pseudos-bad.s
+++ b/llvm/test/MC/Mips/branch-pseudos-bad.s
@@ -1,5 +1,13 @@
# RUN: not llvm-mc %s -triple=mips -mcpu=mips32 2>&1 | FileCheck %s
+# CHECK: error: invalid operand for instruction
+ beql $t0, ($t0), 1
+# CHECK: error: invalid operand for instruction
+ bne $t0, ($t0), 1
+# CHECK: error: invalid operand for instruction
+ beq $t0, ($t0), 1
+
+
# Check for errors when using conditional branch pseudos after .set noat.
.set noat
local_label:
diff --git a/llvm/test/TableGen/RuntimeLibcallEmitter-calling-conv.td b/llvm/test/TableGen/RuntimeLibcallEmitter-calling-conv.td
index 98a376b..2904474 100644
--- a/llvm/test/TableGen/RuntimeLibcallEmitter-calling-conv.td
+++ b/llvm/test/TableGen/RuntimeLibcallEmitter-calling-conv.td
@@ -48,6 +48,11 @@ def MSP430LibraryWithCondCC : SystemRuntimeLibrary<isMSP430,
// CHECK-NEXT: Entry = DefaultCC;
// CHECK-NEXT: }
// CHECK-EMPTY:
+// CHECK-NEXT: static constexpr LibcallImplBitset SystemAvailableImpls({
+// CHECK-NEXT: 0x0000000000001a
+// CHECK-NEXT: });
+// CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
+// CHECK-EMPTY:
// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
// CHECK-NEXT: {RTLIB::MALLOC, RTLIB::impl_malloc}, // malloc
// CHECK-NEXT: };
@@ -70,9 +75,14 @@ def MSP430LibraryWithCondCC : SystemRuntimeLibrary<isMSP430,
// CHECK-NEXT: }
// CHECK-EMPTY:
// CHECK-NEXT: if (TT.getArch() == Triple::avr) {
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT: {RTLIB::MALLOC, RTLIB::impl_malloc}, // malloc
-// CHECK-NEXT: };
+// CHECK-NEXT: static constexpr LibcallImplBitset SystemAvailableImpls({
+// CHECK-NEXT: 0x0000000000001a
+// CHECK-NEXT: });
+// CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
+// CHECK-EMPTY:
+// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
+// CHECK-NEXT: {RTLIB::MALLOC, RTLIB::impl_malloc}, // malloc
+// CHECK-NEXT: };
// CHECK-EMPTY:
// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls) {
// CHECK-NEXT: setLibcallImpl(Func, Impl);
@@ -92,6 +102,11 @@ def MSP430LibraryWithCondCC : SystemRuntimeLibrary<isMSP430,
// CHECK-NEXT: }
// CHECK-EMPTY:
// CHECK-NEXT: if (TT.getArch() == Triple::msp430) {
+// CHECK-NEXT: static constexpr LibcallImplBitset SystemAvailableImpls({
+// CHECK-NEXT: 0x00000000000010
+// CHECK-NEXT: });
+// CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
+// CHECK-EMPTY:
// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
// CHECK-NEXT: {RTLIB::MALLOC, RTLIB::impl_malloc}, // malloc
// CHECK-NEXT: };
diff --git a/llvm/test/TableGen/RuntimeLibcallEmitter-conflict-warning.td b/llvm/test/TableGen/RuntimeLibcallEmitter-conflict-warning.td
index 136c81b..f9a148a 100644
--- a/llvm/test/TableGen/RuntimeLibcallEmitter-conflict-warning.td
+++ b/llvm/test/TableGen/RuntimeLibcallEmitter-conflict-warning.td
@@ -25,9 +25,22 @@ def dup1 : RuntimeLibcallImpl<ANOTHER_DUP>;
// func_a and func_b both provide SOME_FUNC.
// CHECK: if (isTargetArchA()) {
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT: {RTLIB::SOME_FUNC, RTLIB::impl_func_b}, // func_b
-// CHECK-NEXT: };
+// CHECK-NEXT: static constexpr LibcallImplBitset SystemAvailableImpls({
+// CHECK-NEXT: 0x00000000000018
+// CHECK-NEXT: });
+// CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
+// CHECK-EMPTY:
+
+// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
+// CHECK-NEXT: {RTLIB::SOME_FUNC, RTLIB::impl_func_b}, // func_b
+// CHECK-NEXT: };
+// CHECK-EMPTY:
+// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls) {
+// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: }
+// CHECK-EMPTY:
+// CHECK-NEXT: return;
+// CHECK-NEXT: }
// ERR: :[[@LINE+1]]:5: warning: conflicting implementations for libcall SOME_FUNC: func_b, func_a
def TheSystemLibraryA : SystemRuntimeLibrary<isTargetArchA,
@@ -35,10 +48,22 @@ def TheSystemLibraryA : SystemRuntimeLibrary<isTargetArchA,
>;
// CHECK: if (isTargetArchB()) {
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT: {RTLIB::OTHER_FUNC, RTLIB::impl_other_func}, // other_func
-// CHECK-NEXT: {RTLIB::SOME_FUNC, RTLIB::impl_func_a}, // func_a
-// CHECK-NEXT: };
+// CHECK-NEXT: static constexpr LibcallImplBitset SystemAvailableImpls({
+// CHECK-NEXT: 0x00000000000058
+// CHECK-NEXT: });
+// CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
+// CHECK-EMPTY:
+// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
+// CHECK-NEXT: {RTLIB::OTHER_FUNC, RTLIB::impl_other_func}, // other_func
+// CHECK-NEXT: {RTLIB::SOME_FUNC, RTLIB::impl_func_a}, // func_a
+// CHECK-NEXT: };
+// CHECK-EMPTY:
+// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls) {
+// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: }
+// CHECK-EMPTY:
+// CHECK-NEXT: return;
+// CHECK-NEXT: }
// ERR: :[[@LINE+1]]:5: warning: conflicting implementations for libcall SOME_FUNC: func_a, func_b
def TheSystemLibraryB : SystemRuntimeLibrary<isTargetArchB,
@@ -46,11 +71,23 @@ def TheSystemLibraryB : SystemRuntimeLibrary<isTargetArchB,
>;
// CHECK: if (isTargetArchC()) {
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT: {RTLIB::ANOTHER_DUP, RTLIB::impl_dup1}, // dup1
-// CHECK-NEXT: {RTLIB::OTHER_FUNC, RTLIB::impl_other_func}, // other_func
-// CHECK-NEXT: {RTLIB::SOME_FUNC, RTLIB::impl_func_a}, // func_a
-// CHECK-NEXT: };
+// CHECK-NEXT: static constexpr LibcallImplBitset SystemAvailableImpls({
+// CHECK-NEXT: 0x0000000000007e
+// CHECK-NEXT: });
+// CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
+// CHECK-EMPTY:
+// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
+// CHECK-NEXT: {RTLIB::ANOTHER_DUP, RTLIB::impl_dup1}, // dup1
+// CHECK-NEXT: {RTLIB::OTHER_FUNC, RTLIB::impl_other_func}, // other_func
+// CHECK-NEXT: {RTLIB::SOME_FUNC, RTLIB::impl_func_a}, // func_a
+// CHECK-NEXT: };
+// CHECK-EMPTY:
+// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls) {
+// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: }
+// CHECK-EMPTY:
+// CHECK-NEXT: return;
+// CHECK-NEXT: }
// ERR: :[[@LINE+3]]:5: warning: conflicting implementations for libcall ANOTHER_DUP: dup1, dup0
// ERR: :[[@LINE+2]]:5: warning: conflicting implementations for libcall SOME_FUNC: func_a, func_b
diff --git a/llvm/test/TableGen/RuntimeLibcallEmitter.td b/llvm/test/TableGen/RuntimeLibcallEmitter.td
index c336fee..0c23e3b 100644
--- a/llvm/test/TableGen/RuntimeLibcallEmitter.td
+++ b/llvm/test/TableGen/RuntimeLibcallEmitter.td
@@ -196,15 +196,20 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
// CHECK-NEXT: };
// CHECK-EMPTY:
// CHECK-NEXT: if (TT.getArch() == Triple::blah) {
-// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
-// CHECK-NEXT: {RTLIB::BZERO, RTLIB::impl_bzero}, // bzero
-// CHECK-NEXT: {RTLIB::CALLOC, RTLIB::impl_calloc}, // calloc
-// CHECK-NEXT: {RTLIB::SQRT_F128, RTLIB::impl_sqrtl_f128}, // sqrtl
-// CHECK-NEXT: };
+// CHECK-NEXT: static constexpr LibcallImplBitset SystemAvailableImpls({
+// CHECK-NEXT: 0x000000000000e0
+// CHECK-NEXT: });
+// CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
// CHECK-EMPTY:
-// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls) {
-// CHECK-NEXT: setLibcallImpl(Func, Impl);
-// CHECK-NEXT: }
+// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
+// CHECK-NEXT: {RTLIB::BZERO, RTLIB::impl_bzero}, // bzero
+// CHECK-NEXT: {RTLIB::CALLOC, RTLIB::impl_calloc}, // calloc
+// CHECK-NEXT: {RTLIB::SQRT_F128, RTLIB::impl_sqrtl_f128}, // sqrtl
+// CHECK-NEXT: };
+// CHECK-EMPTY:
+// CHECK-NEXT: for (const auto [Func, Impl] : LibraryCalls) {
+// CHECK-NEXT: setLibcallImpl(Func, Impl);
+// CHECK-NEXT: }
// CHECK-EMPTY:
// CHECK-NEXT: if (TT.hasCompilerRT()) {
// CHECK-NEXT: static const LibcallImplPair LibraryCalls_hasCompilerRT[] = {
@@ -233,6 +238,11 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
// CHECK-NEXT: }
// CHECK-EMPTY:
// CHECK-NEXT: if (TT.getArch() == Triple::buzz) {
+// CHECK-NEXT: static constexpr LibcallImplBitset SystemAvailableImpls({
+// CHECK-NEXT: 0x00000000000118
+// CHECK-NEXT: });
+// CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
+// CHECK-EMPTY:
// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
// CHECK-NEXT: {RTLIB::SHL_I32, RTLIB::impl___ashlsi3}, // __ashlsi3
// CHECK-NEXT: {RTLIB::SQRT_F80, RTLIB::impl_sqrtl_f80}, // sqrtl
@@ -247,6 +257,11 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
// CHECK-NEXT: }
// CHECK-EMPTY:
// CHECK-NEXT: if (TT.getArch() == Triple::foo) {
+// CHECK-NEXT: static constexpr LibcallImplBitset SystemAvailableImpls({
+// CHECK-NEXT: 0x000000000000a0
+// CHECK-NEXT: });
+// CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
+// CHECK-EMPTY:
// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
// CHECK-NEXT: {RTLIB::BZERO, RTLIB::impl_bzero}, // bzero
// CHECK-NEXT: {RTLIB::SQRT_F128, RTLIB::impl_sqrtl_f128}, // sqrtl
@@ -271,6 +286,11 @@ def BlahLibrary : SystemRuntimeLibrary<isBlahArch, (add calloc, LibraryWithCondi
// CHECK-NEXT: }
// CHECK-EMPTY:
// CHECK-NEXT: if (TT.getArch() == Triple::simple) {
+// CHECK-NEXT: static constexpr LibcallImplBitset SystemAvailableImpls({
+// CHECK-NEXT: 0x00000000000158
+// CHECK-NEXT: });
+// CHECK-NEXT: AvailableLibcallImpls = SystemAvailableImpls;
+// CHECK-EMPTY:
// CHECK-NEXT: static const LibcallImplPair LibraryCalls[] = {
// CHECK-NEXT: {RTLIB::CALLOC, RTLIB::impl_calloc}, // calloc
// CHECK-NEXT: {RTLIB::SHL_I32, RTLIB::impl___ashlsi3}, // __ashlsi3
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/baseoffs-sext-bug.ll b/llvm/test/Transforms/CodeGenPrepare/X86/baseoffs-sext-bug.ll
new file mode 100644
index 0000000..51a461e
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/baseoffs-sext-bug.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' < %s | FileCheck --check-prefix=GEP %s
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -addr-sink-using-gep=false < %s | FileCheck --check-prefix=NO-GEP %s
+
+target triple = "x86_64--linux-gnu"
+target datalayout = "e-m:e-p0:128:128-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+; -p0:128:128 is added to ensure that transformation will be triggered.
+
+define i128 @test(i128 %arg) {
+; GEP-LABEL: define i128 @test(
+; GEP-SAME: i128 [[ARG:%.*]]) {
+; GEP-NEXT: [[ENTRY:.*]]:
+; GEP-NEXT: [[CMP:%.*]] = icmp ugt i128 [[ARG]], 10
+; GEP-NEXT: br i1 [[CMP]], label %[[THEN:.*]], label %[[EXIT:.*]]
+; GEP: [[THEN]]:
+; GEP-NEXT: [[SUNKADDR:%.*]] = inttoptr i128 [[ARG]] to ptr
+; GEP-NEXT: [[SUNKADDR1:%.*]] = getelementptr i8, ptr [[SUNKADDR]], i128 -32
+; GEP-NEXT: [[LOAD:%.*]] = load i128, ptr [[SUNKADDR1]], align 16
+; GEP-NEXT: br label %[[EXIT]]
+; GEP: [[EXIT]]:
+; GEP-NEXT: [[PHI:%.*]] = phi i128 [ [[LOAD]], %[[THEN]] ], [ 0, %[[ENTRY]] ]
+; GEP-NEXT: ret i128 [[PHI]]
+;
+; NO-GEP-LABEL: define i128 @test(
+; NO-GEP-SAME: i128 [[ARG:%.*]]) {
+; NO-GEP-NEXT: [[ENTRY:.*]]:
+; NO-GEP-NEXT: [[CMP:%.*]] = icmp ugt i128 [[ARG]], 10
+; NO-GEP-NEXT: br i1 [[CMP]], label %[[THEN:.*]], label %[[EXIT:.*]]
+; NO-GEP: [[THEN]]:
+; NO-GEP-NEXT: [[SUNKADDR:%.*]] = add i128 [[ARG]], -32
+; NO-GEP-NEXT: [[SUNKADDR1:%.*]] = inttoptr i128 [[SUNKADDR]] to ptr
+; NO-GEP-NEXT: [[LOAD:%.*]] = load i128, ptr [[SUNKADDR1]], align 16
+; NO-GEP-NEXT: br label %[[EXIT]]
+; NO-GEP: [[EXIT]]:
+; NO-GEP-NEXT: [[PHI:%.*]] = phi i128 [ [[LOAD]], %[[THEN]] ], [ 0, %[[ENTRY]] ]
+; NO-GEP-NEXT: ret i128 [[PHI]]
+;
+entry:
+ %add = add i128 %arg, -32
+ %cmp = icmp ugt i128 %arg, 10
+ br i1 %cmp, label %then, label %exit
+
+then:
+ %inttoptr = inttoptr i128 %add to ptr
+ %load = load i128, ptr %inttoptr, align 16
+ br label %exit
+
+exit:
+ %phi = phi i128 [ %load, %then ], [ 0, %entry ]
+ ret i128 %phi
+}
+
+define void @test_combine(ptr %ptr, i128 %arg) {
+; GEP-LABEL: define void @test_combine(
+; GEP-SAME: ptr [[PTR:%.*]], i128 [[ARG:%.*]]) {
+; GEP-NEXT: [[ENTRY:.*:]]
+; GEP-NEXT: [[CMP:%.*]] = icmp ugt i128 [[ARG]], 10
+; GEP-NEXT: [[SELECT1:%.*]] = select i1 [[CMP]], i128 -32, i128 0
+; GEP-NEXT: [[SUNKADDR:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i128 [[SELECT1]]
+; GEP-NEXT: store i128 1, ptr [[SUNKADDR]], align 16
+; GEP-NEXT: ret void
+;
+; NO-GEP-LABEL: define void @test_combine(
+; NO-GEP-SAME: ptr [[PTR:%.*]], i128 [[ARG:%.*]]) {
+; NO-GEP-NEXT: [[ENTRY:.*:]]
+; NO-GEP-NEXT: [[CMP:%.*]] = icmp ugt i128 [[ARG]], 10
+; NO-GEP-NEXT: [[SELECT1:%.*]] = select i1 [[CMP]], i128 -32, i128 0
+; NO-GEP-NEXT: [[SUNKADDR:%.*]] = ptrtoint ptr [[PTR]] to i128
+; NO-GEP-NEXT: [[SUNKADDR2:%.*]] = add i128 [[SUNKADDR]], [[SELECT1]]
+; NO-GEP-NEXT: [[SUNKADDR3:%.*]] = inttoptr i128 [[SUNKADDR2]] to ptr
+; NO-GEP-NEXT: store i128 1, ptr [[SUNKADDR3]], align 16
+; NO-GEP-NEXT: ret void
+;
+entry:
+ %cmp = icmp ugt i128 %arg, 10
+ %gep = getelementptr inbounds i8, ptr %ptr, i128 -32
+ %select = select i1 %cmp, ptr %gep, ptr %ptr
+ store i128 1, ptr %select, align 16
+ ret void
+}
+
diff --git a/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll b/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll
index 663f459..de38752 100644
--- a/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll
+++ b/llvm/test/Transforms/DFAJumpThreading/dfa-unfold-select.ll
@@ -227,10 +227,6 @@ define i32 @test3(i32 %num) {
; CHECK-NEXT: i32 1, label [[CASE1:%.*]]
; CHECK-NEXT: i32 2, label [[CASE2:%.*]]
; CHECK-NEXT: ]
-; CHECK: for.body.jt4:
-; CHECK-NEXT: [[COUNT_JT4:%.*]] = phi i32 [ [[INC_JT4:%.*]], [[FOR_INC_JT4:%.*]] ]
-; CHECK-NEXT: [[STATE_JT4:%.*]] = phi i32 [ [[STATE_NEXT_JT4:%.*]], [[FOR_INC_JT4]] ]
-; CHECK-NEXT: br label [[FOR_INC_JT1]]
; CHECK: for.body.jt3:
; CHECK-NEXT: [[COUNT_JT3:%.*]] = phi i32 [ [[INC_JT3:%.*]], [[FOR_INC_JT3:%.*]] ]
; CHECK-NEXT: [[STATE_JT3:%.*]] = phi i32 [ [[STATE_NEXT_JT3:%.*]], [[FOR_INC_JT3]] ]
@@ -261,17 +257,14 @@ define i32 @test3(i32 %num) {
; CHECK: sel.2.si.unfold.false:
; CHECK-NEXT: [[DOTSI_UNFOLD_PHI1:%.*]] = phi i32 [ 4, [[SEL_2_SI_UNFOLD_TRUE_JT3]] ]
; CHECK-NEXT: br label [[SEL_3_SI_UNFOLD_FALSE]]
-; CHECK: sel.2.si.unfold.false.jt4:
+; CHECK: sel.2.si.unfold.false.jt3:
; CHECK-NEXT: [[DOTSI_UNFOLD_PHI1_JT4:%.*]] = phi i32 [ 4, [[SEL_2_SI_UNFOLD_TRUE:%.*]] ]
-; CHECK-NEXT: br label [[SEL_3_SI_UNFOLD_FALSE_JT4:%.*]]
+; CHECK-NEXT: br label [[SEL_3_SI_UNFOLD_FALSE_JT3]]
; CHECK: sel.3.si.unfold.false:
; CHECK-NEXT: [[SEL_2_SI_UNFOLD_PHI:%.*]] = phi i32 [ poison, [[SEL_2_SI_UNFOLD_TRUE]] ], [ [[DOTSI_UNFOLD_PHI1]], [[SEL_2_SI_UNFOLD_FALSE]] ]
; CHECK-NEXT: br label [[FOR_INC]]
-; CHECK: sel.3.si.unfold.false.jt4:
-; CHECK-NEXT: [[SEL_2_SI_UNFOLD_PHI_JT4:%.*]] = phi i32 [ [[DOTSI_UNFOLD_PHI1_JT4]], [[SEL_2_SI_UNFOLD_FALSE_JT4]] ]
-; CHECK-NEXT: br label [[FOR_INC_JT4]]
; CHECK: sel.3.si.unfold.false.jt3:
-; CHECK-NEXT: [[SEL_2_SI_UNFOLD_PHI_JT3:%.*]] = phi i32 [ [[DOTSI_UNFOLD_PHI_JT3]], [[SEL_2_SI_UNFOLD_TRUE_JT3]] ]
+; CHECK-NEXT: [[SEL_2_SI_UNFOLD_PHI_JT3:%.*]] = phi i32 [ [[DOTSI_UNFOLD_PHI_JT3]], [[SEL_2_SI_UNFOLD_TRUE_JT3]] ], [ [[DOTSI_UNFOLD_PHI1_JT4]], [[SEL_2_SI_UNFOLD_FALSE_JT4]] ]
; CHECK-NEXT: br label [[FOR_INC_JT3]]
; CHECK: sel.1.si.unfold.true:
; CHECK-NEXT: br i1 [[CMP_1]], label [[FOR_INC]], label [[SEL_1_SI_UNFOLD_FALSE_JT2:%.*]]
@@ -289,11 +282,6 @@ define i32 @test3(i32 %num) {
; CHECK-NEXT: [[INC]] = add nsw i32 [[COUNT5]], 1
; CHECK-NEXT: [[CMP_EXIT:%.*]] = icmp slt i32 [[INC]], [[NUM:%.*]]
; CHECK-NEXT: br i1 [[CMP_EXIT]], label [[FOR_BODY]], label [[FOR_END:%.*]]
-; CHECK: for.inc.jt4:
-; CHECK-NEXT: [[STATE_NEXT_JT4]] = phi i32 [ [[SEL_2_SI_UNFOLD_PHI_JT4]], [[SEL_3_SI_UNFOLD_FALSE_JT4]] ]
-; CHECK-NEXT: [[INC_JT4]] = add nsw i32 undef, 1
-; CHECK-NEXT: [[CMP_EXIT_JT4:%.*]] = icmp slt i32 [[INC_JT4]], [[NUM]]
-; CHECK-NEXT: br i1 [[CMP_EXIT_JT4]], label [[FOR_BODY_JT4:%.*]], label [[FOR_END]]
; CHECK: for.inc.jt3:
; CHECK-NEXT: [[STATE_NEXT_JT3]] = phi i32 [ [[SEL_2_SI_UNFOLD_PHI_JT3]], [[SEL_3_SI_UNFOLD_FALSE_JT3]] ]
; CHECK-NEXT: [[INC_JT3]] = add nsw i32 [[COUNT5]], 1
@@ -305,8 +293,8 @@ define i32 @test3(i32 %num) {
; CHECK-NEXT: [[CMP_EXIT_JT2:%.*]] = icmp slt i32 [[INC_JT2]], [[NUM]]
; CHECK-NEXT: br i1 [[CMP_EXIT_JT2]], label [[FOR_BODY_JT2]], label [[FOR_END]]
; CHECK: for.inc.jt1:
-; CHECK-NEXT: [[COUNT4:%.*]] = phi i32 [ [[COUNT_JT4]], [[FOR_BODY_JT4]] ], [ [[COUNT_JT3]], [[FOR_BODY_JT3]] ], [ [[COUNT5]], [[SEL_1_SI_UNFOLD_TRUE_JT1]] ], [ [[COUNT]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[STATE_NEXT_JT1]] = phi i32 [ 1, [[FOR_BODY]] ], [ 1, [[FOR_BODY_JT3]] ], [ 1, [[FOR_BODY_JT4]] ], [ [[DOTSI_UNFOLD_PHI2_JT1]], [[SEL_1_SI_UNFOLD_TRUE_JT1]] ]
+; CHECK-NEXT: [[COUNT4:%.*]] = phi i32 [ [[COUNT_JT3]], [[FOR_BODY_JT3]] ], [ [[COUNT5]], [[SEL_1_SI_UNFOLD_TRUE_JT1]] ], [ [[COUNT]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[STATE_NEXT_JT1]] = phi i32 [ 1, [[FOR_BODY]] ], [ 1, [[FOR_BODY_JT3]] ], [ [[DOTSI_UNFOLD_PHI2_JT1]], [[SEL_1_SI_UNFOLD_TRUE_JT1]] ]
; CHECK-NEXT: [[INC_JT1]] = add nsw i32 [[COUNT4]], 1
; CHECK-NEXT: [[CMP_EXIT_JT1:%.*]] = icmp slt i32 [[INC_JT1]], [[NUM]]
; CHECK-NEXT: br i1 [[CMP_EXIT_JT1]], label [[FOR_BODY_JT1]], label [[FOR_END]]
@@ -402,36 +390,28 @@ define void @pr65222(i32 %flags, i1 %cmp, i1 %tobool.not) {
; CHECK-NEXT: br label [[IF_END_JT2:%.*]]
; CHECK: cond1.si.unfold.true:
; CHECK-NEXT: br i1 [[CMP]], label [[IF_END]], label [[COND1_SI_UNFOLD_FALSE_JT1:%.*]]
-; CHECK: cond1.si.unfold.true.jt3:
+; CHECK: cond1.si.unfold.true.jt2:
; CHECK-NEXT: [[DOTSI_UNFOLD_PHI2:%.*]] = phi i32 [ 3, [[THEN]] ]
-; CHECK-NEXT: br i1 [[CMP]], label [[IF_END_JT3:%.*]], label [[COND1_SI_UNFOLD_FALSE:%.*]]
+; CHECK-NEXT: br i1 [[CMP]], label [[IF_END_JT2]], label [[COND1_SI_UNFOLD_FALSE:%.*]]
; CHECK: cond1.si.unfold.false:
; CHECK-NEXT: [[DOTSI_UNFOLD_PHI3:%.*]] = phi i32 [ 1, [[COND1_SI_UNFOLD_TRUE]] ]
; CHECK-NEXT: br label [[IF_END]]
-; CHECK: cond1.si.unfold.false.jt1:
+; CHECK: cond1.si.unfold.false.jt2:
; CHECK-NEXT: [[DOTSI_UNFOLD_PHI3_JT1:%.*]] = phi i32 [ 1, [[COND1_SI_UNFOLD_TRUE1:%.*]] ]
-; CHECK-NEXT: br label [[IF_END_JT1:%.*]]
+; CHECK-NEXT: br label [[IF_END_JT2]]
; CHECK: if.end:
; CHECK-NEXT: [[UNFOLDED:%.*]] = phi i32 [ [[FLAGS:%.*]], [[WHILE_COND]] ], [ [[COND_SI_UNFOLD_PHI]], [[TOUNFOLD_SI_UNFOLD_FALSE1]] ], [ poison, [[COND1_SI_UNFOLD_TRUE1]] ], [ [[DOTSI_UNFOLD_PHI3]], [[COND1_SI_UNFOLD_FALSE]] ]
; CHECK-NEXT: [[OTHER:%.*]] = phi i32 [ [[FLAGS]], [[WHILE_COND]] ], [ 0, [[TOUNFOLD_SI_UNFOLD_FALSE1]] ], [ 0, [[COND1_SI_UNFOLD_TRUE1]] ], [ 0, [[COND1_SI_UNFOLD_FALSE]] ]
; CHECK-NEXT: switch i32 [[UNFOLDED]], label [[UNREACHABLE:%.*]] [
; CHECK-NEXT: i32 0, label [[SW_BB:%.*]]
; CHECK-NEXT: ]
-; CHECK: if.end.jt1:
-; CHECK-NEXT: [[UNFOLDED_JT1:%.*]] = phi i32 [ [[DOTSI_UNFOLD_PHI3_JT1]], [[COND1_SI_UNFOLD_FALSE_JT1]] ]
-; CHECK-NEXT: [[OTHER_JT1:%.*]] = phi i32 [ 0, [[COND1_SI_UNFOLD_FALSE_JT1]] ]
-; CHECK-NEXT: br label [[UNREACHABLE]]
-; CHECK: if.end.jt3:
-; CHECK-NEXT: [[UNFOLDED_JT3:%.*]] = phi i32 [ [[DOTSI_UNFOLD_PHI2]], [[COND1_SI_UNFOLD_TRUE]] ]
-; CHECK-NEXT: [[OTHER_JT3:%.*]] = phi i32 [ 0, [[COND1_SI_UNFOLD_TRUE]] ]
-; CHECK-NEXT: br label [[UNREACHABLE]]
; CHECK: if.end.jt0:
; CHECK-NEXT: [[UNFOLDED_JT0:%.*]] = phi i32 [ [[COND_SI_UNFOLD_PHI_JT0]], [[TOUNFOLD_SI_UNFOLD_FALSE_JT0]] ]
; CHECK-NEXT: [[OTHER_JT0:%.*]] = phi i32 [ 0, [[TOUNFOLD_SI_UNFOLD_FALSE_JT0]] ]
; CHECK-NEXT: br label [[SW_BB]]
; CHECK: if.end.jt2:
-; CHECK-NEXT: [[UNFOLDED_JT2:%.*]] = phi i32 [ [[COND_SI_UNFOLD_PHI_JT2]], [[TOUNFOLD_SI_UNFOLD_FALSE]] ]
-; CHECK-NEXT: [[OTHER_JT2:%.*]] = phi i32 [ 0, [[TOUNFOLD_SI_UNFOLD_FALSE]] ]
+; CHECK-NEXT: [[UNFOLDED_JT2:%.*]] = phi i32 [ [[COND_SI_UNFOLD_PHI_JT2]], [[TOUNFOLD_SI_UNFOLD_FALSE]] ], [ [[DOTSI_UNFOLD_PHI2]], [[COND1_SI_UNFOLD_TRUE]] ], [ [[DOTSI_UNFOLD_PHI3_JT1]], [[COND1_SI_UNFOLD_FALSE_JT1]] ]
+; CHECK-NEXT: [[OTHER_JT2:%.*]] = phi i32 [ 0, [[TOUNFOLD_SI_UNFOLD_FALSE]] ], [ 0, [[COND1_SI_UNFOLD_TRUE]] ], [ 0, [[COND1_SI_UNFOLD_FALSE_JT1]] ]
; CHECK-NEXT: br label [[UNREACHABLE]]
; CHECK: unreachable:
; CHECK-NEXT: unreachable
diff --git a/llvm/test/Transforms/DFAJumpThreading/equivalent-states.ll b/llvm/test/Transforms/DFAJumpThreading/equivalent-states.ll
new file mode 100644
index 0000000..4555dfb
--- /dev/null
+++ b/llvm/test/Transforms/DFAJumpThreading/equivalent-states.ll
@@ -0,0 +1,281 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=dfa-jump-threading %s | FileCheck %s
+
+declare void @do_something()
+declare void @user(i32)
+
+define void @equivalent_on_default(i1 %c1) {
+; CHECK-LABEL: define void @equivalent_on_default(
+; CHECK-SAME: i1 [[C1:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[SWITCH_BB:%.*]]
+; CHECK: switch_bb:
+; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ poison, [[CASE2END:%.*]] ]
+; CHECK-NEXT: switch i32 [[PHI]], label [[DEFAULT_DEST:%.*]] [
+; CHECK-NEXT: i32 0, label [[CASE1:%.*]]
+; CHECK-NEXT: i32 1, label [[CASE2:%.*]]
+; CHECK-NEXT: ]
+; CHECK: switch_bb.jt2:
+; CHECK-NEXT: [[PHI_JT2:%.*]] = phi i32 [ [[PHI_CASE2_JT2:%.*]], [[CASE2END_JT2:%.*]] ]
+; CHECK-NEXT: br label [[DEFAULT_DEST]]
+; CHECK: switch_bb.jt1:
+; CHECK-NEXT: [[PHI_JT1:%.*]] = phi i32 [ 1, [[CASE1]] ]
+; CHECK-NEXT: br label [[CASE2]]
+; CHECK: case1:
+; CHECK-NEXT: br label [[SWITCH_BB_JT1:%.*]]
+; CHECK: case2:
+; CHECK-NEXT: br i1 [[C1]], label [[CASE2THEN:%.*]], label [[CASE2END_JT2]]
+; CHECK: case2then:
+; CHECK-NEXT: br label [[CASE2END_JT2]]
+; CHECK: case2end:
+; CHECK-NEXT: call void @do_something()
+; CHECK-NEXT: br label [[SWITCH_BB]]
+; CHECK: case2end.jt2:
+; CHECK-NEXT: [[PHI_CASE2_JT2]] = phi i32 [ 2, [[CASE2]] ], [ 3, [[CASE2THEN]] ]
+; CHECK-NEXT: call void @do_something()
+; CHECK-NEXT: br label [[SWITCH_BB_JT2:%.*]]
+; CHECK: default_dest:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %switch_bb
+
+switch_bb:
+ %phi = phi i32 [ 0, %entry ], [ 1, %case1 ], [ %phi_case2, %case2end ]
+ switch i32 %phi, label %default_dest [
+ i32 0, label %case1
+ i32 1, label %case2
+ ]
+
+case1:
+ br label %switch_bb
+
+case2:
+ br i1 %c1, label %case2then, label %case2end
+
+case2then:
+ br label %case2end
+
+case2end:
+ %phi_case2 = phi i32 [ 2, %case2 ] , [ 3, %case2then ]
+ call void @do_something()
+ br label %switch_bb
+
+default_dest:
+ ret void
+}
+
+define void @equivalent_on_default_user(i1 %c1) {
+; CHECK-LABEL: define void @equivalent_on_default_user(
+; CHECK-SAME: i1 [[C1:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[SWITCH_BB:%.*]]
+; CHECK: switch_bb:
+; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ poison, [[CASE2END:%.*]] ]
+; CHECK-NEXT: switch i32 [[PHI]], label [[DEFAULT_DEST:%.*]] [
+; CHECK-NEXT: i32 0, label [[CASE1:%.*]]
+; CHECK-NEXT: i32 1, label [[CASE2:%.*]]
+; CHECK-NEXT: ]
+; CHECK: switch_bb.jt2:
+; CHECK-NEXT: [[PHI_JT2:%.*]] = phi i32 [ [[PHI_CASE2_JT2:%.*]], [[CASE2END_JT2:%.*]] ]
+; CHECK-NEXT: br label [[DEFAULT_DEST]]
+; CHECK: switch_bb.jt1:
+; CHECK-NEXT: [[PHI_JT1:%.*]] = phi i32 [ 1, [[CASE1]] ]
+; CHECK-NEXT: br label [[CASE2]]
+; CHECK: case1:
+; CHECK-NEXT: br label [[SWITCH_BB_JT1:%.*]]
+; CHECK: case2:
+; CHECK-NEXT: br i1 [[C1]], label [[CASE2THEN:%.*]], label [[CASE2END_JT2]]
+; CHECK: case2then:
+; CHECK-NEXT: br label [[CASE2END_JT2]]
+; CHECK: case2end:
+; CHECK-NEXT: call void @do_something()
+; CHECK-NEXT: call void @user(i32 poison)
+; CHECK-NEXT: br label [[SWITCH_BB]]
+; CHECK: case2end.jt2:
+; CHECK-NEXT: [[PHI_CASE2_JT2]] = phi i32 [ 2, [[CASE2]] ], [ 3, [[CASE2THEN]] ]
+; CHECK-NEXT: call void @do_something()
+; CHECK-NEXT: call void @user(i32 [[PHI_CASE2_JT2]])
+; CHECK-NEXT: br label [[SWITCH_BB_JT2:%.*]]
+; CHECK: default_dest:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %switch_bb
+
+switch_bb:
+ %phi = phi i32 [ 0, %entry ], [ 1, %case1 ], [ %phi_case2, %case2end ]
+ switch i32 %phi, label %default_dest [
+ i32 0, label %case1
+ i32 1, label %case2
+ ]
+
+case1:
+ br label %switch_bb
+
+case2:
+ br i1 %c1, label %case2then, label %case2end
+
+case2then:
+ br label %case2end
+
+case2end:
+ %phi_case2 = phi i32 [ 2, %case2 ] , [ 3, %case2then ]
+ call void @do_something()
+ call void @user(i32 %phi_case2)
+ br label %switch_bb
+
+default_dest:
+ ret void
+}
+
+define void @equivalent_only_cases(i1 %c1) {
+; CHECK-LABEL: define void @equivalent_only_cases(
+; CHECK-SAME: i1 [[C1:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[SWITCH_BB:%.*]]
+; CHECK: switch_bb:
+; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ poison, [[CASE2END:%.*]] ]
+; CHECK-NEXT: switch i32 [[PHI]], label [[DEFAULT_DEST:%.*]] [
+; CHECK-NEXT: i32 0, label [[CASE1:%.*]]
+; CHECK-NEXT: i32 1, label [[CASE2:%.*]]
+; CHECK-NEXT: i32 2, label [[CASE1]]
+; CHECK-NEXT: i32 3, label [[CASE1]]
+; CHECK-NEXT: ]
+; CHECK: switch_bb.jt2:
+; CHECK-NEXT: [[PHI_JT2:%.*]] = phi i32 [ [[PHI_CASE2_JT2:%.*]], [[CASE2END_JT2:%.*]] ]
+; CHECK-NEXT: br label [[CASE1]]
+; CHECK: switch_bb.jt1:
+; CHECK-NEXT: [[PHI_JT1:%.*]] = phi i32 [ 1, [[CASE1]] ]
+; CHECK-NEXT: br label [[CASE2]]
+; CHECK: case1:
+; CHECK-NEXT: call void @do_something()
+; CHECK-NEXT: br label [[SWITCH_BB_JT1:%.*]]
+; CHECK: case2:
+; CHECK-NEXT: br i1 [[C1]], label [[CASE2THEN:%.*]], label [[CASE2END_JT2]]
+; CHECK: case2then:
+; CHECK-NEXT: br label [[CASE2END_JT2]]
+; CHECK: case2end:
+; CHECK-NEXT: call void @do_something()
+; CHECK-NEXT: br label [[SWITCH_BB]]
+; CHECK: case2end.jt2:
+; CHECK-NEXT: [[PHI_CASE2_JT2]] = phi i32 [ 2, [[CASE2]] ], [ 3, [[CASE2THEN]] ]
+; CHECK-NEXT: call void @do_something()
+; CHECK-NEXT: br label [[SWITCH_BB_JT2:%.*]]
+; CHECK: default_dest:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %switch_bb
+
+switch_bb:
+ %phi = phi i32 [ 0, %entry ], [ 1, %case1 ], [ %phi_case2, %case2end ]
+ switch i32 %phi, label %default_dest [
+ i32 0, label %case1
+ i32 1, label %case2
+ i32 2, label %case1
+ i32 3, label %case1
+ ]
+
+case1:
+ call void @do_something()
+ br label %switch_bb
+
+case2:
+ br i1 %c1, label %case2then, label %case2end
+
+case2then:
+ br label %case2end
+
+case2end:
+ %phi_case2 = phi i32 [ 2, %case2 ] , [ 3, %case2then ]
+ call void @do_something()
+ br label %switch_bb
+
+default_dest:
+ ret void
+}
+
+define void @equivalent_both_case_and_default(i1 %c1, i1 %c2) {
+; CHECK-LABEL: define void @equivalent_both_case_and_default(
+; CHECK-SAME: i1 [[C1:%.*]], i1 [[C2:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[SWITCH_BB:%.*]]
+; CHECK: switch_bb:
+; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ poison, [[CASE2END:%.*]] ]
+; CHECK-NEXT: switch i32 [[PHI]], label [[DEFAULT_DEST:%.*]] [
+; CHECK-NEXT: i32 0, label [[CASE1:%.*]]
+; CHECK-NEXT: i32 1, label [[CASE2:%.*]]
+; CHECK-NEXT: i32 2, label [[CASE1]]
+; CHECK-NEXT: i32 3, label [[CASE1]]
+; CHECK-NEXT: ]
+; CHECK: switch_bb.jt4:
+; CHECK-NEXT: [[PHI_JT3:%.*]] = phi i32 [ [[PHI_CASE2_JT3:%.*]], [[CASE2END_JT3:%.*]] ]
+; CHECK-NEXT: br label [[DEFAULT_DEST]]
+; CHECK: switch_bb.jt2:
+; CHECK-NEXT: [[PHI_JT2:%.*]] = phi i32 [ [[PHI_CASE2_JT2:%.*]], [[CASE2END_JT2:%.*]] ]
+; CHECK-NEXT: br label [[CASE1]]
+; CHECK: switch_bb.jt1:
+; CHECK-NEXT: [[PHI_JT1:%.*]] = phi i32 [ 1, [[CASE1]] ]
+; CHECK-NEXT: br label [[CASE2]]
+; CHECK: case1:
+; CHECK-NEXT: call void @do_something()
+; CHECK-NEXT: br label [[SWITCH_BB_JT1:%.*]]
+; CHECK: case2:
+; CHECK-NEXT: br i1 [[C1]], label [[CASE2THEN:%.*]], label [[CASE2END_JT2]]
+; CHECK: case2then:
+; CHECK-NEXT: br i1 [[C2]], label [[CASE2THEN2:%.*]], label [[CASE2END_JT2]]
+; CHECK: case2then2:
+; CHECK-NEXT: br i1 [[C2]], label [[CASE2THEN3:%.*]], label [[CASE2END_JT3]]
+; CHECK: case2then3:
+; CHECK-NEXT: br label [[CASE2END_JT3]]
+; CHECK: case2end:
+; CHECK-NEXT: call void @do_something()
+; CHECK-NEXT: br label [[SWITCH_BB]]
+; CHECK: case2end.jt4:
+; CHECK-NEXT: [[PHI_CASE2_JT3]] = phi i32 [ 4, [[CASE2THEN2]] ], [ 5, [[CASE2THEN3]] ]
+; CHECK-NEXT: call void @do_something()
+; CHECK-NEXT: br label [[SWITCH_BB_JT3:%.*]]
+; CHECK: case2end.jt2:
+; CHECK-NEXT: [[PHI_CASE2_JT2]] = phi i32 [ 2, [[CASE2]] ], [ 3, [[CASE2THEN]] ]
+; CHECK-NEXT: call void @do_something()
+; CHECK-NEXT: br label [[SWITCH_BB_JT2:%.*]]
+; CHECK: default_dest:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %switch_bb
+
+switch_bb:
+ %phi = phi i32 [ 0, %entry ], [ 1, %case1 ], [ %phi_case2, %case2end ]
+ switch i32 %phi, label %default_dest [
+ i32 0, label %case1
+ i32 1, label %case2
+ i32 2, label %case1
+ i32 3, label %case1
+ ]
+
+case1:
+ call void @do_something()
+ br label %switch_bb
+
+case2:
+ br i1 %c1, label %case2then, label %case2end
+
+case2then:
+ br i1 %c2, label %case2then2, label %case2end
+
+case2then2:
+ br i1 %c2, label %case2then3, label %case2end
+
+case2then3:
+ br label %case2end
+
+case2end:
+ %phi_case2 = phi i32 [ 2, %case2 ], [ 3, %case2then ], [ 4, %case2then2 ], [ 5, %case2then3 ]
+ call void @do_something()
+ br label %switch_bb
+
+default_dest:
+ ret void
+}
diff --git a/llvm/test/Transforms/GVN/condprop.ll b/llvm/test/Transforms/GVN/condprop.ll
index eb2a9f1..57bd2f3 100644
--- a/llvm/test/Transforms/GVN/condprop.ll
+++ b/llvm/test/Transforms/GVN/condprop.ll
@@ -360,7 +360,7 @@ define i1 @test6_phi2(i1 %c, i32 %x, i32 %y) {
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], [[Y]]
; CHECK-NEXT: br i1 [[CMP]], label [[BB2]], label [[BB3:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: [[PHI:%.*]] = phi i1 [ [[CMP_NOT]], [[BB1]] ], [ true, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[PHI:%.*]] = phi i1 [ false, [[BB1]] ], [ true, [[ENTRY:%.*]] ]
; CHECK-NEXT: ret i1 [[PHI]]
; CHECK: bb3:
; CHECK-NEXT: ret i1 false
diff --git a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
index 5211fbd..7b0b152 100644
--- a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
+++ b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
@@ -10,8 +10,18 @@ target datalayout = "pe1:64:64:64:32"
@g.as1 = external addrspace(1) global i8
@g2.as1 = external addrspace(1) global i8
-define i32 @ptrtoaddr_inttoptr_arg(i32 %a) {
-; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_arg(
+define i64 @ptrtoaddr_inttoptr_arg(i64 %a) {
+; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_arg(
+; CHECK-SAME: i64 [[A:%.*]]) {
+; CHECK-NEXT: ret i64 [[A]]
+;
+ %toptr = inttoptr i64 %a to ptr
+ %toaddr = ptrtoaddr ptr %toptr to i64
+ ret i64 %toaddr
+}
+
+define i32 @ptrtoaddr_inttoptr_arg_addrsize(i32 %a) {
+; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_arg_addrsize(
; CHECK-SAME: i32 [[A:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[A]] to i64
; CHECK-NEXT: [[TOPTR:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(1)
@@ -130,3 +140,27 @@ define i32 @ptrtoaddr_sub_consts_offset_addrsize() {
;
ret i32 sub (i32 ptrtoaddr (ptr addrspace(1) getelementptr (i8, ptr addrspace(1) @g.as1, i32 42) to i32), i32 ptrtoaddr (ptr addrspace(1) @g.as1 to i32))
}
+
+define i64 @ptrtoaddr_sub_known_offset(ptr %p) {
+; CHECK-LABEL: define i64 @ptrtoaddr_sub_known_offset(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT: ret i64 42
+;
+ %p2 = getelementptr inbounds i8, ptr %p, i64 42
+ %p.addr = ptrtoaddr ptr %p to i64
+ %p2.addr = ptrtoaddr ptr %p2 to i64
+ %sub = sub i64 %p2.addr, %p.addr
+ ret i64 %sub
+}
+
+define i32 @ptrtoaddr_sub_known_offset_addrsize(ptr addrspace(1) %p) {
+; CHECK-LABEL: define i32 @ptrtoaddr_sub_known_offset_addrsize(
+; CHECK-SAME: ptr addrspace(1) [[P:%.*]]) {
+; CHECK-NEXT: ret i32 42
+;
+ %p2 = getelementptr inbounds i8, ptr addrspace(1) %p, i32 42
+ %p.addr = ptrtoaddr ptr addrspace(1) %p to i32
+ %p2.addr = ptrtoaddr ptr addrspace(1) %p2 to i32
+ %sub = sub i32 %p2.addr, %p.addr
+ ret i32 %sub
+}
diff --git a/llvm/test/Transforms/InstCombine/select-safe-transforms.ll b/llvm/test/Transforms/InstCombine/select-safe-transforms.ll
index ebea5bf..d88eaf8 100644
--- a/llvm/test/Transforms/InstCombine/select-safe-transforms.ll
+++ b/llvm/test/Transforms/InstCombine/select-safe-transforms.ll
@@ -1,8 +1,11 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
; RUN: opt < %s -passes=instcombine -S | FileCheck %s
declare i1 @gen1()
+;.
+; CHECK: @glb = global i8 0
+;.
define i1 @cond_eq_and(i8 %X, i8 %Y, i8 noundef %C) {
; CHECK-LABEL: @cond_eq_and(
; CHECK-NEXT: [[COND:%.*]] = icmp eq i8 [[X:%.*]], [[C:%.*]]
@@ -16,16 +19,16 @@ define i1 @cond_eq_and(i8 %X, i8 %Y, i8 noundef %C) {
ret i1 %res
}
-define i1 @cond_eq_and_const(i8 %X, i8 %Y) {
+define i1 @cond_eq_and_const(i8 %X, i8 %Y) !prof !0 {
; CHECK-LABEL: @cond_eq_and_const(
; CHECK-NEXT: [[COND:%.*]] = icmp eq i8 [[X:%.*]], 10
; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i8 [[Y:%.*]], 10
-; CHECK-NEXT: [[RES:%.*]] = select i1 [[COND]], i1 [[TMP1]], i1 false
+; CHECK-NEXT: [[RES:%.*]] = select i1 [[COND]], i1 [[TMP1]], i1 false, !prof [[PROF1:![0-9]+]]
; CHECK-NEXT: ret i1 [[RES]]
;
%cond = icmp eq i8 %X, 10
%lhs = icmp ult i8 %X, %Y
- %res = select i1 %cond, i1 %lhs, i1 false
+ %res = select i1 %cond, i1 %lhs, i1 false, !prof !1
ret i1 %res
}
@@ -42,16 +45,16 @@ define i1 @cond_eq_or(i8 %X, i8 %Y, i8 noundef %C) {
ret i1 %res
}
-define i1 @cond_eq_or_const(i8 %X, i8 %Y) {
+define i1 @cond_eq_or_const(i8 %X, i8 %Y) !prof !0 {
; CHECK-LABEL: @cond_eq_or_const(
; CHECK-NEXT: [[COND:%.*]] = icmp ne i8 [[X:%.*]], 10
; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i8 [[Y:%.*]], 10
-; CHECK-NEXT: [[RES:%.*]] = select i1 [[COND]], i1 true, i1 [[TMP1]]
+; CHECK-NEXT: [[RES:%.*]] = select i1 [[COND]], i1 true, i1 [[TMP1]], !prof [[PROF1]]
; CHECK-NEXT: ret i1 [[RES]]
;
%cond = icmp ne i8 %X, 10
%lhs = icmp ult i8 %X, %Y
- %res = select i1 %cond, i1 true, i1 %lhs
+ %res = select i1 %cond, i1 true, i1 %lhs, !prof !1
ret i1 %res
}
@@ -793,3 +796,10 @@ define <2 x i1> @not_logical_and2(i1 %b, <2 x i32> %a) {
%or = select <2 x i1> %and, <2 x i1> <i1 true, i1 true>, <2 x i1> %implied
ret <2 x i1> %or
}
+
+!0 = !{!"function_entry_count", i64 1000}
+!1 = !{!"branch_weights", i32 2, i32 3}
+;.
+; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 2, i32 3}
+;.
diff --git a/llvm/test/Transforms/InstCombine/sprintf-1.ll b/llvm/test/Transforms/InstCombine/sprintf-1.ll
index 1d87758..1798514 100644
--- a/llvm/test/Transforms/InstCombine/sprintf-1.ll
+++ b/llvm/test/Transforms/InstCombine/sprintf-1.ll
@@ -5,8 +5,7 @@
; RUN: opt < %s -mtriple xcore-xmos-elf -passes=instcombine -S | FileCheck %s -check-prefixes=CHECK,WITHSTPCPY,CHECK-IPRINTF
; RUN: opt < %s -mtriple=i386-pc-windows-msvc -passes=instcombine -S | FileCheck %s --check-prefixes=CHECK,NOSTPCPY
; RUN: opt < %s -mtriple=i386-mingw32 -passes=instcombine -S | FileCheck %s --check-prefixes=CHECK,NOSTPCPY
-; RUN: opt < %s -mtriple=armv7-none-linux-android16 -passes=instcombine -S | FileCheck %s --check-prefixes=CHECK,NOSTPCPY
-; RUN: opt < %s -mtriple=armv7-none-linux-android21 -passes=instcombine -S | FileCheck %s --check-prefixes=CHECK,WITHSTPCPY
+; RUN: opt < %s -mtriple=armv7-none-linux-android -passes=instcombine -S | FileCheck %s --check-prefixes=CHECK,WITHSTPCPY
; RUN: opt < %s -mtriple=x86_64-scei-ps4 -passes=instcombine -S | FileCheck %s --check-prefixes=CHECK,NOSTPCPY
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
diff --git a/llvm/test/Transforms/InstSimplify/ptr_diff.ll b/llvm/test/Transforms/InstSimplify/ptr_diff.ll
index fdd9e8e..508dfbc 100644
--- a/llvm/test/Transforms/InstSimplify/ptr_diff.ll
+++ b/llvm/test/Transforms/InstSimplify/ptr_diff.ll
@@ -14,11 +14,7 @@ define i64 @ptrdiff(ptr %ptr) {
define i64 @ptrdiff_no_inbounds(ptr %ptr) {
; CHECK-LABEL: @ptrdiff_no_inbounds(
-; CHECK-NEXT: [[LAST:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i32 42
-; CHECK-NEXT: [[FIRST_INT:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[LAST_INT:%.*]] = ptrtoint ptr [[LAST]] to i64
-; CHECK-NEXT: [[DIFF:%.*]] = sub i64 [[LAST_INT]], [[FIRST_INT]]
-; CHECK-NEXT: ret i64 [[DIFF]]
+; CHECK-NEXT: ret i64 42
;
%last = getelementptr i8, ptr %ptr, i32 42
%first.int = ptrtoint ptr %ptr to i64
diff --git a/llvm/test/Transforms/SafeStack/X86/abi_ssp.ll b/llvm/test/Transforms/SafeStack/X86/abi_ssp.ll
index e66127533..3b6fb3a 100644
--- a/llvm/test/Transforms/SafeStack/X86/abi_ssp.ll
+++ b/llvm/test/Transforms/SafeStack/X86/abi_ssp.ll
@@ -1,9 +1,7 @@
; RUN: opt -safe-stack -S -mtriple=i686-pc-linux-gnu < %s -o - | FileCheck --check-prefixes=COMMON,TLS32 %s
; RUN: opt -safe-stack -S -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck --check-prefixes=COMMON,TLS64 %s
-; RUN: opt -safe-stack -S -mtriple=i686-linux-android < %s -o - | FileCheck --check-prefixes=COMMON,GLOBAL32 %s
-; RUN: opt -safe-stack -S -mtriple=i686-linux-android24 < %s -o - | FileCheck --check-prefixes=COMMON,TLS32 %s
-
+; RUN: opt -safe-stack -S -mtriple=i686-linux-android < %s -o - | FileCheck --check-prefixes=COMMON,TLS32 %s
; RUN: opt -safe-stack -S -mtriple=x86_64-linux-android < %s -o - | FileCheck --check-prefixes=COMMON,TLS64 %s
; RUN: opt -safe-stack -S -mtriple=x86_64-unknown-fuchsia < %s -o - | FileCheck --check-prefixes=COMMON,FUCHSIA64 %s
@@ -11,9 +9,7 @@
; RUN: opt -passes=safe-stack -S -mtriple=i686-pc-linux-gnu < %s -o - | FileCheck --check-prefixes=COMMON,TLS32 %s
; RUN: opt -passes=safe-stack -S -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck --check-prefixes=COMMON,TLS64 %s
-; RUN: opt -passes=safe-stack -S -mtriple=i686-linux-android < %s -o - | FileCheck --check-prefixes=COMMON,GLOBAL32 %s
-; RUN: opt -passes=safe-stack -S -mtriple=i686-linux-android24 < %s -o - | FileCheck --check-prefixes=COMMON,TLS32 %s
-
+; RUN: opt -passes=safe-stack -S -mtriple=i686-linux-android < %s -o - | FileCheck --check-prefixes=COMMON,TLS32 %s
; RUN: opt -passes=safe-stack -S -mtriple=x86_64-linux-android < %s -o - | FileCheck --check-prefixes=COMMON,TLS64 %s
; RUN: opt -passes=safe-stack -S -mtriple=x86_64-unknown-fuchsia < %s -o - | FileCheck --check-prefixes=COMMON,FUCHSIA64 %s
diff --git a/llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll b/llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll
index 0fc3c19..a43e762 100644
--- a/llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll
+++ b/llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll
@@ -401,4 +401,27 @@ b:
ret i32 %1
}
+define i32 @else_will_be_unreachable(i1 %arg) {
+; CHECK-LABEL: @else_will_be_unreachable(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[I:%.*]] = select i1 [[ARG:%.*]], i32 0, i32 1
+; CHECK-NEXT: ret i32 [[I]]
+;
+entry:
+ switch i1 %arg, label %else [
+ i1 false, label %if
+ i1 true, label %if
+ ]
+
+if:
+ br i1 %arg, label %else, label %bb
+
+bb:
+ br label %else
+
+else:
+ %i = phi i32 [ 0, %entry ], [ 0, %if ], [ 1, %bb ]
+ ret i32 %i
+}
+
declare void @bar(ptr nonnull dereferenceable(4))
diff --git a/llvm/test/tools/llvm-dwarfdump/X86/DW_AT_language_name.s b/llvm/test/tools/llvm-dwarfdump/X86/DW_AT_language_name.s
new file mode 100644
index 0000000..af58175
--- /dev/null
+++ b/llvm/test/tools/llvm-dwarfdump/X86/DW_AT_language_name.s
@@ -0,0 +1,35 @@
+# Demonstrate dumping DW_AT_language_name.
+# RUN: llvm-mc -triple=x86_64--linux -filetype=obj < %s | \
+# RUN: llvm-dwarfdump -v - | FileCheck %s
+
+# CHECK: .debug_abbrev contents:
+# CHECK: DW_AT_language_name DW_FORM_data2
+# CHECK: DW_AT_language_name DW_FORM_data2
+# CHECK: .debug_info contents:
+# CHECK: DW_AT_language_name [DW_FORM_data2] (DW_LNAME_C)
+# CHECK: DW_AT_language_name [DW_FORM_data2] (0x0000)
+
+ .section .debug_abbrev,"",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 1 # DW_CHILDREN_no
+ .ascii "\220\001" # DW_AT_language_name
+ .byte 5 # DW_FORM_data2
+ .ascii "\220\001" # DW_AT_language_name
+ .byte 5 # DW_FORM_data2
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+
+ .section .debug_info,"",@progbits
+ .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+ .short 5 # DWARF version number
+ .byte 1 # Unit type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .byte 1 # Abbrev [1] DW_TAG_compile_unit
+ .short 3 # DW_AT_language_name
+ .short 0 # DW_AT_language_name
+ .byte 0
+.Ldebug_info_end0:
diff --git a/llvm/test/tools/llvm-dwarfdump/X86/DW_AT_language_version.s b/llvm/test/tools/llvm-dwarfdump/X86/DW_AT_language_version.s
new file mode 100644
index 0000000..f1be8fd
--- /dev/null
+++ b/llvm/test/tools/llvm-dwarfdump/X86/DW_AT_language_version.s
@@ -0,0 +1,35 @@
+# Demonstrate dumping DW_AT_language_version.
+# RUN: llvm-mc -triple=x86_64--linux -filetype=obj < %s | \
+# RUN: llvm-dwarfdump -v - | FileCheck %s
+
+# CHECK: .debug_abbrev contents:
+# CHECK: DW_AT_language_version DW_FORM_data4
+# CHECK: DW_AT_language_version DW_FORM_data2
+# CHECK: .debug_info contents:
+# CHECK: DW_AT_language_version [DW_FORM_data4] (201402)
+# CHECK: DW_AT_language_version [DW_FORM_data2] (0)
+
+ .section .debug_abbrev,"",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 1 # DW_CHILDREN_no
+ .ascii "\221\001" # DW_AT_language_version
+ .byte 6 # DW_FORM_data4
+ .ascii "\221\001" # DW_AT_language_version
+ .byte 5 # DW_FORM_data2
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+
+ .section .debug_info,"",@progbits
+ .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+ .short 5 # DWARF version number
+ .byte 1 # Unit type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .byte 1 # Abbrev [1] DW_TAG_compile_unit
+ .long 201402 # DW_AT_language_version
+ .short 0 # DW_AT_language_version
+ .byte 0
+.Ldebug_info_end0:
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vector-fp.s b/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vector-fp.s
index e1e9b57..64e3ed9 100644
--- a/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vector-fp.s
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFiveX390/vector-fp.s
@@ -2323,13 +2323,13 @@ vfncvt.rod.f.f.w v8, v16
# CHECK-NEXT: 1 8 16.00 8 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w v8, v16
# CHECK-NEXT: 1 8 16.00 8 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w v8, v16
# CHECK-NEXT: 1 3 1.00 U 1 VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI vsetvli zero, zero, e64, m1, tu, mu
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFADD_VV vfadd.vv v8, v16, v24
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFADD_VF vfadd.vf v8, v16, fs0
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSUB_VV vfsub.vv v8, v16, v24
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSUB_VF vfsub.vf v8, v16, fs0
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFRSUB_VF vfrsub.vf v8, v16, fs0
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMUL_VV vfmul.vv v8, v16, v24
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMUL_VF vfmul.vf v8, v16, fs0
+# CHECK-NEXT: 1 23 16.00 23 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFADD_VV vfadd.vv v8, v16, v24
+# CHECK-NEXT: 1 23 16.00 23 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFADD_VF vfadd.vf v8, v16, fs0
+# CHECK-NEXT: 1 23 16.00 23 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSUB_VV vfsub.vv v8, v16, v24
+# CHECK-NEXT: 1 23 16.00 23 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSUB_VF vfsub.vf v8, v16, fs0
+# CHECK-NEXT: 1 23 16.00 23 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFRSUB_VF vfrsub.vf v8, v16, fs0
+# CHECK-NEXT: 1 23 16.00 23 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMUL_VV vfmul.vv v8, v16, v24
+# CHECK-NEXT: 1 23 16.00 23 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMUL_VF vfmul.vf v8, v16, fs0
# CHECK-NEXT: 1 228 228.00 228 VLEN1024X300SiFive7VA1[1,229],VLEN1024X300SiFive7VA1OrVA2[1,229],VLEN1024X300SiFive7VCQ VFDIV_VV vfdiv.vv v8, v16, v24
# CHECK-NEXT: 1 228 228.00 228 VLEN1024X300SiFive7VA1[1,229],VLEN1024X300SiFive7VA1OrVA2[1,229],VLEN1024X300SiFive7VCQ VFDIV_VF vfdiv.vf v8, v16, fs0
# CHECK-NEXT: 1 228 228.00 228 VLEN1024X300SiFive7VA1[1,229],VLEN1024X300SiFive7VA1OrVA2[1,229],VLEN1024X300SiFive7VCQ VFRDIV_VF vfrdiv.vf v8, v16, fs0
@@ -2352,22 +2352,22 @@ vfncvt.rod.f.f.w v8, v16
# CHECK-NEXT: 1 228 228.00 228 VLEN1024X300SiFive7VA1[1,229],VLEN1024X300SiFive7VA1OrVA2[1,229],VLEN1024X300SiFive7VCQ VFSQRT_V vfsqrt.v v8, v24
# CHECK-NEXT: 1 8 2.00 8 VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFRSQRT7_V vfrsqrt7.v v8, v24
# CHECK-NEXT: 1 8 2.00 8 VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v v8, v24
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv v8, v16, v24
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf v8, v16, fs0
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv v8, v16, v24
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf v8, v16, fs0
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJ_VV vfsgnj.vv v8, v16, v24
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJ_VF vfsgnj.vf v8, v16, fs0
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJN_VV vfsgnjn.vv v8, v16, v24
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJN_VF vfsgnjn.vf v8, v16, fs0
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJX_VV vfsgnjx.vv v8, v16, v24
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJX_VF vfsgnjx.vf v8, v16, fs0
+# CHECK-NEXT: 1 19 16.00 19 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv v8, v16, v24
+# CHECK-NEXT: 1 19 16.00 19 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf v8, v16, fs0
+# CHECK-NEXT: 1 19 16.00 19 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv v8, v16, v24
+# CHECK-NEXT: 1 19 16.00 19 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf v8, v16, fs0
+# CHECK-NEXT: 1 19 16.00 19 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJ_VV vfsgnj.vv v8, v16, v24
+# CHECK-NEXT: 1 19 16.00 19 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJ_VF vfsgnj.vf v8, v16, fs0
+# CHECK-NEXT: 1 19 16.00 19 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJN_VV vfsgnjn.vv v8, v16, v24
+# CHECK-NEXT: 1 19 16.00 19 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJN_VF vfsgnjn.vf v8, v16, fs0
+# CHECK-NEXT: 1 19 16.00 19 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJX_VV vfsgnjx.vv v8, v16, v24
+# CHECK-NEXT: 1 19 16.00 19 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFSGNJX_VF vfsgnjx.vf v8, v16, fs0
# CHECK-NEXT: 1 8 2.00 8 VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v v8, v16
# CHECK-NEXT: 1 8 2.00 8 VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v v8, v16
# CHECK-NEXT: 1 8 2.00 8 VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v v8, v16
# CHECK-NEXT: 1 8 2.00 8 VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v v8, v16
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v v8, v16
-# CHECK-NEXT: 1 16 16.00 16 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v v8, v16
+# CHECK-NEXT: 1 23 16.00 23 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v v8, v16
+# CHECK-NEXT: 1 23 16.00 23 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v v8, v16
# CHECK-NEXT: 1 8 2.00 8 VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v v8, v16
# CHECK-NEXT: 1 8 2.00 8 VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v v8, v16
# CHECK-NEXT: 1 8 2.00 8 VLEN1024X300SiFive7VA1[1,3],VLEN1024X300SiFive7VA1OrVA2[1,3],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v v8, v16
@@ -2384,13 +2384,13 @@ vfncvt.rod.f.f.w v8, v16
# CHECK-NEXT: 1 8 16.00 8 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w v8, v16
# CHECK-NEXT: 1 8 16.00 8 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w v8, v16
# CHECK-NEXT: 1 3 1.00 U 1 VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI vsetvli zero, zero, e64, m2, tu, mu
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFADD_VV vfadd.vv v8, v16, v24
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFADD_VF vfadd.vf v8, v16, fs0
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSUB_VV vfsub.vv v8, v16, v24
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSUB_VF vfsub.vf v8, v16, fs0
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFRSUB_VF vfrsub.vf v8, v16, fs0
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMUL_VV vfmul.vv v8, v16, v24
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMUL_VF vfmul.vf v8, v16, fs0
+# CHECK-NEXT: 1 39 32.00 39 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFADD_VV vfadd.vv v8, v16, v24
+# CHECK-NEXT: 1 39 32.00 39 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFADD_VF vfadd.vf v8, v16, fs0
+# CHECK-NEXT: 1 39 32.00 39 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSUB_VV vfsub.vv v8, v16, v24
+# CHECK-NEXT: 1 39 32.00 39 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSUB_VF vfsub.vf v8, v16, fs0
+# CHECK-NEXT: 1 39 32.00 39 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFRSUB_VF vfrsub.vf v8, v16, fs0
+# CHECK-NEXT: 1 39 32.00 39 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMUL_VV vfmul.vv v8, v16, v24
+# CHECK-NEXT: 1 39 32.00 39 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMUL_VF vfmul.vf v8, v16, fs0
# CHECK-NEXT: 1 456 456.00 456 VLEN1024X300SiFive7VA1[1,457],VLEN1024X300SiFive7VA1OrVA2[1,457],VLEN1024X300SiFive7VCQ VFDIV_VV vfdiv.vv v8, v16, v24
# CHECK-NEXT: 1 456 456.00 456 VLEN1024X300SiFive7VA1[1,457],VLEN1024X300SiFive7VA1OrVA2[1,457],VLEN1024X300SiFive7VCQ VFDIV_VF vfdiv.vf v8, v16, fs0
# CHECK-NEXT: 1 456 456.00 456 VLEN1024X300SiFive7VA1[1,457],VLEN1024X300SiFive7VA1OrVA2[1,457],VLEN1024X300SiFive7VCQ VFRDIV_VF vfrdiv.vf v8, v16, fs0
@@ -2413,22 +2413,22 @@ vfncvt.rod.f.f.w v8, v16
# CHECK-NEXT: 1 456 456.00 456 VLEN1024X300SiFive7VA1[1,457],VLEN1024X300SiFive7VA1OrVA2[1,457],VLEN1024X300SiFive7VCQ VFSQRT_V vfsqrt.v v8, v24
# CHECK-NEXT: 1 8 4.00 8 VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFRSQRT7_V vfrsqrt7.v v8, v24
# CHECK-NEXT: 1 8 4.00 8 VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v v8, v24
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv v8, v16, v24
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf v8, v16, fs0
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv v8, v16, v24
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf v8, v16, fs0
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJ_VV vfsgnj.vv v8, v16, v24
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJ_VF vfsgnj.vf v8, v16, fs0
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJN_VV vfsgnjn.vv v8, v16, v24
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJN_VF vfsgnjn.vf v8, v16, fs0
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJX_VV vfsgnjx.vv v8, v16, v24
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJX_VF vfsgnjx.vf v8, v16, fs0
+# CHECK-NEXT: 1 35 32.00 35 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv v8, v16, v24
+# CHECK-NEXT: 1 35 32.00 35 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf v8, v16, fs0
+# CHECK-NEXT: 1 35 32.00 35 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv v8, v16, v24
+# CHECK-NEXT: 1 35 32.00 35 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf v8, v16, fs0
+# CHECK-NEXT: 1 35 32.00 35 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJ_VV vfsgnj.vv v8, v16, v24
+# CHECK-NEXT: 1 35 32.00 35 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJ_VF vfsgnj.vf v8, v16, fs0
+# CHECK-NEXT: 1 35 32.00 35 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJN_VV vfsgnjn.vv v8, v16, v24
+# CHECK-NEXT: 1 35 32.00 35 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJN_VF vfsgnjn.vf v8, v16, fs0
+# CHECK-NEXT: 1 35 32.00 35 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJX_VV vfsgnjx.vv v8, v16, v24
+# CHECK-NEXT: 1 35 32.00 35 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFSGNJX_VF vfsgnjx.vf v8, v16, fs0
# CHECK-NEXT: 1 8 4.00 8 VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v v8, v16
# CHECK-NEXT: 1 8 4.00 8 VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v v8, v16
# CHECK-NEXT: 1 8 4.00 8 VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v v8, v16
# CHECK-NEXT: 1 8 4.00 8 VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v v8, v16
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v v8, v16
-# CHECK-NEXT: 1 32 32.00 32 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v v8, v16
+# CHECK-NEXT: 1 39 32.00 39 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v v8, v16
+# CHECK-NEXT: 1 39 32.00 39 VLEN1024X300SiFive7VA1[1,33],VLEN1024X300SiFive7VA1OrVA2[1,33],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v v8, v16
# CHECK-NEXT: 1 8 4.00 8 VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v v8, v16
# CHECK-NEXT: 1 8 4.00 8 VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v v8, v16
# CHECK-NEXT: 1 8 4.00 8 VLEN1024X300SiFive7VA1[1,5],VLEN1024X300SiFive7VA1OrVA2[1,5],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v v8, v16
@@ -2445,13 +2445,13 @@ vfncvt.rod.f.f.w v8, v16
# CHECK-NEXT: 1 8 16.00 8 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w v8, v16
# CHECK-NEXT: 1 8 16.00 8 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w v8, v16
# CHECK-NEXT: 1 3 1.00 U 1 VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI vsetvli zero, zero, e64, m4, tu, mu
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFADD_VV vfadd.vv v8, v16, v24
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFADD_VF vfadd.vf v8, v16, fs0
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSUB_VV vfsub.vv v8, v16, v24
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSUB_VF vfsub.vf v8, v16, fs0
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFRSUB_VF vfrsub.vf v8, v16, fs0
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMUL_VV vfmul.vv v8, v16, v24
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMUL_VF vfmul.vf v8, v16, fs0
+# CHECK-NEXT: 1 71 64.00 71 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFADD_VV vfadd.vv v8, v16, v24
+# CHECK-NEXT: 1 71 64.00 71 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFADD_VF vfadd.vf v8, v16, fs0
+# CHECK-NEXT: 1 71 64.00 71 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSUB_VV vfsub.vv v8, v16, v24
+# CHECK-NEXT: 1 71 64.00 71 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSUB_VF vfsub.vf v8, v16, fs0
+# CHECK-NEXT: 1 71 64.00 71 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFRSUB_VF vfrsub.vf v8, v16, fs0
+# CHECK-NEXT: 1 71 64.00 71 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMUL_VV vfmul.vv v8, v16, v24
+# CHECK-NEXT: 1 71 64.00 71 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMUL_VF vfmul.vf v8, v16, fs0
# CHECK-NEXT: 1 912 912.00 912 VLEN1024X300SiFive7VA1[1,913],VLEN1024X300SiFive7VA1OrVA2[1,913],VLEN1024X300SiFive7VCQ VFDIV_VV vfdiv.vv v8, v16, v24
# CHECK-NEXT: 1 912 912.00 912 VLEN1024X300SiFive7VA1[1,913],VLEN1024X300SiFive7VA1OrVA2[1,913],VLEN1024X300SiFive7VCQ VFDIV_VF vfdiv.vf v8, v16, fs0
# CHECK-NEXT: 1 912 912.00 912 VLEN1024X300SiFive7VA1[1,913],VLEN1024X300SiFive7VA1OrVA2[1,913],VLEN1024X300SiFive7VCQ VFRDIV_VF vfrdiv.vf v8, v16, fs0
@@ -2474,22 +2474,22 @@ vfncvt.rod.f.f.w v8, v16
# CHECK-NEXT: 1 912 912.00 912 VLEN1024X300SiFive7VA1[1,913],VLEN1024X300SiFive7VA1OrVA2[1,913],VLEN1024X300SiFive7VCQ VFSQRT_V vfsqrt.v v8, v24
# CHECK-NEXT: 1 8 8.00 8 VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFRSQRT7_V vfrsqrt7.v v8, v24
# CHECK-NEXT: 1 8 8.00 8 VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v v8, v24
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv v8, v16, v24
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf v8, v16, fs0
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv v8, v16, v24
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf v8, v16, fs0
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJ_VV vfsgnj.vv v8, v16, v24
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJ_VF vfsgnj.vf v8, v16, fs0
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJN_VV vfsgnjn.vv v8, v16, v24
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJN_VF vfsgnjn.vf v8, v16, fs0
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJX_VV vfsgnjx.vv v8, v16, v24
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJX_VF vfsgnjx.vf v8, v16, fs0
+# CHECK-NEXT: 1 67 64.00 67 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv v8, v16, v24
+# CHECK-NEXT: 1 67 64.00 67 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf v8, v16, fs0
+# CHECK-NEXT: 1 67 64.00 67 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv v8, v16, v24
+# CHECK-NEXT: 1 67 64.00 67 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf v8, v16, fs0
+# CHECK-NEXT: 1 67 64.00 67 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJ_VV vfsgnj.vv v8, v16, v24
+# CHECK-NEXT: 1 67 64.00 67 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJ_VF vfsgnj.vf v8, v16, fs0
+# CHECK-NEXT: 1 67 64.00 67 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJN_VV vfsgnjn.vv v8, v16, v24
+# CHECK-NEXT: 1 67 64.00 67 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJN_VF vfsgnjn.vf v8, v16, fs0
+# CHECK-NEXT: 1 67 64.00 67 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJX_VV vfsgnjx.vv v8, v16, v24
+# CHECK-NEXT: 1 67 64.00 67 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFSGNJX_VF vfsgnjx.vf v8, v16, fs0
# CHECK-NEXT: 1 8 8.00 8 VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v v8, v16
# CHECK-NEXT: 1 8 8.00 8 VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v v8, v16
# CHECK-NEXT: 1 8 8.00 8 VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v v8, v16
# CHECK-NEXT: 1 8 8.00 8 VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v v8, v16
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v v8, v16
-# CHECK-NEXT: 1 64 64.00 64 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v v8, v16
+# CHECK-NEXT: 1 71 64.00 71 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v v8, v16
+# CHECK-NEXT: 1 71 64.00 71 VLEN1024X300SiFive7VA1[1,65],VLEN1024X300SiFive7VA1OrVA2[1,65],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v v8, v16
# CHECK-NEXT: 1 8 8.00 8 VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v v8, v16
# CHECK-NEXT: 1 8 8.00 8 VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v v8, v16
# CHECK-NEXT: 1 8 8.00 8 VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v v8, v16
@@ -2506,13 +2506,13 @@ vfncvt.rod.f.f.w v8, v16
# CHECK-NEXT: 1 8 16.00 8 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_F_F_W vfncvt.f.f.w v8, v16
# CHECK-NEXT: 1 8 16.00 8 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFNCVT_ROD_F_F_W vfncvt.rod.f.f.w v8, v16
# CHECK-NEXT: 1 3 1.00 U 1 VLEN1024X300SiFive7PipeA,VLEN1024X300SiFive7PipeAB VSETVLI vsetvli zero, zero, e64, m8, tu, mu
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFADD_VV vfadd.vv v8, v16, v24
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFADD_VF vfadd.vf v8, v16, fs0
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSUB_VV vfsub.vv v8, v16, v24
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSUB_VF vfsub.vf v8, v16, fs0
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFRSUB_VF vfrsub.vf v8, v16, fs0
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMUL_VV vfmul.vv v8, v16, v24
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMUL_VF vfmul.vf v8, v16, fs0
+# CHECK-NEXT: 1 135 128.00 135 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFADD_VV vfadd.vv v8, v16, v24
+# CHECK-NEXT: 1 135 128.00 135 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFADD_VF vfadd.vf v8, v16, fs0
+# CHECK-NEXT: 1 135 128.00 135 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSUB_VV vfsub.vv v8, v16, v24
+# CHECK-NEXT: 1 135 128.00 135 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSUB_VF vfsub.vf v8, v16, fs0
+# CHECK-NEXT: 1 135 128.00 135 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFRSUB_VF vfrsub.vf v8, v16, fs0
+# CHECK-NEXT: 1 135 128.00 135 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMUL_VV vfmul.vv v8, v16, v24
+# CHECK-NEXT: 1 135 128.00 135 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMUL_VF vfmul.vf v8, v16, fs0
# CHECK-NEXT: 1 1824 1824.00 1824 VLEN1024X300SiFive7VA1[1,1825],VLEN1024X300SiFive7VA1OrVA2[1,1825],VLEN1024X300SiFive7VCQ VFDIV_VV vfdiv.vv v8, v16, v24
# CHECK-NEXT: 1 1824 1824.00 1824 VLEN1024X300SiFive7VA1[1,1825],VLEN1024X300SiFive7VA1OrVA2[1,1825],VLEN1024X300SiFive7VCQ VFDIV_VF vfdiv.vf v8, v16, fs0
# CHECK-NEXT: 1 1824 1824.00 1824 VLEN1024X300SiFive7VA1[1,1825],VLEN1024X300SiFive7VA1OrVA2[1,1825],VLEN1024X300SiFive7VCQ VFRDIV_VF vfrdiv.vf v8, v16, fs0
@@ -2535,22 +2535,22 @@ vfncvt.rod.f.f.w v8, v16
# CHECK-NEXT: 1 1824 1824.00 1824 VLEN1024X300SiFive7VA1[1,1825],VLEN1024X300SiFive7VA1OrVA2[1,1825],VLEN1024X300SiFive7VCQ VFSQRT_V vfsqrt.v v8, v24
# CHECK-NEXT: 1 8 16.00 8 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFRSQRT7_V vfrsqrt7.v v8, v24
# CHECK-NEXT: 1 8 16.00 8 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFREC7_V vfrec7.v v8, v24
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv v8, v16, v24
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf v8, v16, fs0
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv v8, v16, v24
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf v8, v16, fs0
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJ_VV vfsgnj.vv v8, v16, v24
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJ_VF vfsgnj.vf v8, v16, fs0
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJN_VV vfsgnjn.vv v8, v16, v24
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJN_VF vfsgnjn.vf v8, v16, fs0
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJX_VV vfsgnjx.vv v8, v16, v24
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJX_VF vfsgnjx.vf v8, v16, fs0
+# CHECK-NEXT: 1 131 128.00 131 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMIN_VV vfmin.vv v8, v16, v24
+# CHECK-NEXT: 1 131 128.00 131 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMIN_VF vfmin.vf v8, v16, fs0
+# CHECK-NEXT: 1 131 128.00 131 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMAX_VV vfmax.vv v8, v16, v24
+# CHECK-NEXT: 1 131 128.00 131 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFMAX_VF vfmax.vf v8, v16, fs0
+# CHECK-NEXT: 1 131 128.00 131 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJ_VV vfsgnj.vv v8, v16, v24
+# CHECK-NEXT: 1 131 128.00 131 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJ_VF vfsgnj.vf v8, v16, fs0
+# CHECK-NEXT: 1 131 128.00 131 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJN_VV vfsgnjn.vv v8, v16, v24
+# CHECK-NEXT: 1 131 128.00 131 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJN_VF vfsgnjn.vf v8, v16, fs0
+# CHECK-NEXT: 1 131 128.00 131 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJX_VV vfsgnjx.vv v8, v16, v24
+# CHECK-NEXT: 1 131 128.00 131 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFSGNJX_VF vfsgnjx.vf v8, v16, fs0
# CHECK-NEXT: 1 8 16.00 8 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_XU_F_V vfcvt.xu.f.v v8, v16
# CHECK-NEXT: 1 8 16.00 8 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_X_F_V vfcvt.x.f.v v8, v16
# CHECK-NEXT: 1 8 16.00 8 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_RTZ_XU_F_V vfcvt.rtz.xu.f.v v8, v16
# CHECK-NEXT: 1 8 16.00 8 VLEN1024X300SiFive7VA1[1,17],VLEN1024X300SiFive7VA1OrVA2[1,17],VLEN1024X300SiFive7VCQ VFCVT_RTZ_X_F_V vfcvt.rtz.x.f.v v8, v16
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v v8, v16
-# CHECK-NEXT: 1 128 128.00 128 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v v8, v16
+# CHECK-NEXT: 1 135 128.00 135 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFCVT_F_XU_V vfcvt.f.xu.v v8, v16
+# CHECK-NEXT: 1 135 128.00 135 VLEN1024X300SiFive7VA1[1,129],VLEN1024X300SiFive7VA1OrVA2[1,129],VLEN1024X300SiFive7VCQ VFCVT_F_X_V vfcvt.f.x.v v8, v16
# CHECK-NEXT: 1 8 8.00 8 VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_XU_F_V vfwcvt.xu.f.v v8, v16
# CHECK-NEXT: 1 8 8.00 8 VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_X_F_V vfwcvt.x.f.v v8, v16
# CHECK-NEXT: 1 8 8.00 8 VLEN1024X300SiFive7VA1[1,9],VLEN1024X300SiFive7VA1OrVA2[1,9],VLEN1024X300SiFive7VCQ VFWCVT_RTZ_XU_F_V vfwcvt.rtz.xu.f.v v8, v16
diff --git a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
index 434449c..1031932 100644
--- a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
+++ b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
@@ -253,25 +253,17 @@ public:
break;
}
case BasicBlockLevel: {
- const auto &BBVecMap = Emb->getBBVecMap();
for (const BasicBlock &BB : F) {
- auto It = BBVecMap.find(&BB);
- if (It != BBVecMap.end()) {
- OS << BB.getName() << ":";
- It->second.print(OS);
- }
+ OS << BB.getName() << ":";
+ Emb->getBBVector(BB).print(OS);
}
break;
}
case InstructionLevel: {
- const auto &InstMap = Emb->getInstVecMap();
for (const BasicBlock &BB : F) {
for (const Instruction &I : BB) {
- auto It = InstMap.find(&I);
- if (It != InstMap.end()) {
- I.print(OS);
- It->second.print(OS);
- }
+ I.print(OS);
+ Emb->getInstVector(I).print(OS);
}
}
break;
diff --git a/llvm/unittests/ADT/BitsetTest.cpp b/llvm/unittests/ADT/BitsetTest.cpp
new file mode 100644
index 0000000..0ecd213
--- /dev/null
+++ b/llvm/unittests/ADT/BitsetTest.cpp
@@ -0,0 +1,71 @@
+//===- llvm/unittest/ADT/BitsetTest.cpp -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Bitset.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+template <unsigned NumBits>
+class TestBitsetUInt64Array : public Bitset<NumBits> {
+ static constexpr unsigned NumElts = (NumBits + 63) / 64;
+
+public:
+ TestBitsetUInt64Array(const std::array<uint64_t, NumElts> &B)
+ : Bitset<NumBits>(B) {}
+
+ bool verifyValue(const std::array<uint64_t, NumElts> &B) const {
+ for (unsigned I = 0; I != NumBits; ++I) {
+ bool ReferenceVal =
+ (B[(I / 64)] & (static_cast<uint64_t>(1) << (I % 64))) != 0;
+ if (ReferenceVal != this->test(I))
+ return false;
+ }
+
+ return true;
+ }
+
+ void verifyStorageSize(size_t elements_64_bit, size_t elements_32_bit) {
+ if constexpr (sizeof(uintptr_t) == sizeof(uint64_t))
+ EXPECT_EQ(sizeof(*this), elements_64_bit * sizeof(uintptr_t));
+ else
+ EXPECT_EQ(sizeof(*this), elements_32_bit * sizeof(uintptr_t));
+ }
+};
+
+TEST(BitsetTest, Construction) {
+ std::array<uint64_t, 2> TestVals = {0x123456789abcdef3, 0x1337d3a0b22c24};
+ TestBitsetUInt64Array<96> Test(TestVals);
+ EXPECT_TRUE(Test.verifyValue(TestVals));
+ Test.verifyStorageSize(2, 3);
+
+ TestBitsetUInt64Array<65> Test1(TestVals);
+ EXPECT_TRUE(Test1.verifyValue(TestVals));
+ Test1.verifyStorageSize(2, 3);
+
+ std::array<uint64_t, 1> TestSingleVal = {0x12345678abcdef99};
+
+ TestBitsetUInt64Array<64> Test64(TestSingleVal);
+ EXPECT_TRUE(Test64.verifyValue(TestSingleVal));
+ Test64.verifyStorageSize(1, 2);
+
+ TestBitsetUInt64Array<30> Test30(TestSingleVal);
+ EXPECT_TRUE(Test30.verifyValue(TestSingleVal));
+ Test30.verifyStorageSize(1, 1);
+
+ TestBitsetUInt64Array<32> Test32(TestSingleVal);
+ EXPECT_TRUE(Test32.verifyValue(TestSingleVal));
+ Test32.verifyStorageSize(1, 1);
+
+ TestBitsetUInt64Array<33> Test33(TestSingleVal);
+ EXPECT_TRUE(Test33.verifyValue(TestSingleVal));
+ Test33.verifyStorageSize(1, 2);
+}
+} // namespace
diff --git a/llvm/unittests/ADT/CMakeLists.txt b/llvm/unittests/ADT/CMakeLists.txt
index dafd735..848ccba 100644
--- a/llvm/unittests/ADT/CMakeLists.txt
+++ b/llvm/unittests/ADT/CMakeLists.txt
@@ -12,6 +12,7 @@ add_llvm_unittest(ADTTests
BitFieldsTest.cpp
BitmaskEnumTest.cpp
BitTest.cpp
+ BitsetTest.cpp
BitVectorTest.cpp
BreadthFirstIteratorTest.cpp
BumpPtrListTest.cpp
diff --git a/llvm/unittests/Analysis/IR2VecTest.cpp b/llvm/unittests/Analysis/IR2VecTest.cpp
index 40b4aa2..8ffc5f6 100644
--- a/llvm/unittests/Analysis/IR2VecTest.cpp
+++ b/llvm/unittests/Analysis/IR2VecTest.cpp
@@ -30,7 +30,9 @@ namespace {
class TestableEmbedder : public Embedder {
public:
TestableEmbedder(const Function &F, const Vocabulary &V) : Embedder(F, V) {}
- void computeEmbeddings(const BasicBlock &BB) const override {}
+ Embedding computeEmbeddings(const Instruction &I) const override {
+ return Embedding();
+ }
};
TEST(EmbeddingTest, ConstructorsAndAccessors) {
@@ -321,18 +323,12 @@ protected:
}
};
-TEST_F(IR2VecTestFixture, GetInstVecMap_Symbolic) {
+TEST_F(IR2VecTestFixture, GetInstVec_Symbolic) {
auto Emb = Embedder::create(IR2VecKind::Symbolic, *F, *V);
ASSERT_TRUE(static_cast<bool>(Emb));
- const auto &InstMap = Emb->getInstVecMap();
-
- EXPECT_EQ(InstMap.size(), 2u);
- EXPECT_TRUE(InstMap.count(AddInst));
- EXPECT_TRUE(InstMap.count(RetInst));
-
- const auto &AddEmb = InstMap.at(AddInst);
- const auto &RetEmb = InstMap.at(RetInst);
+ const auto &AddEmb = Emb->getInstVector(*AddInst);
+ const auto &RetEmb = Emb->getInstVector(*RetInst);
EXPECT_EQ(AddEmb.size(), 2u);
EXPECT_EQ(RetEmb.size(), 2u);
@@ -340,51 +336,17 @@ TEST_F(IR2VecTestFixture, GetInstVecMap_Symbolic) {
EXPECT_TRUE(RetEmb.approximatelyEquals(Embedding(2, 15.5)));
}
-TEST_F(IR2VecTestFixture, GetInstVecMap_FlowAware) {
- auto Emb = Embedder::create(IR2VecKind::FlowAware, *F, *V);
- ASSERT_TRUE(static_cast<bool>(Emb));
-
- const auto &InstMap = Emb->getInstVecMap();
-
- EXPECT_EQ(InstMap.size(), 2u);
- EXPECT_TRUE(InstMap.count(AddInst));
- EXPECT_TRUE(InstMap.count(RetInst));
-
- EXPECT_EQ(InstMap.at(AddInst).size(), 2u);
- EXPECT_EQ(InstMap.at(RetInst).size(), 2u);
-
- EXPECT_TRUE(InstMap.at(AddInst).approximatelyEquals(Embedding(2, 25.5)));
- EXPECT_TRUE(InstMap.at(RetInst).approximatelyEquals(Embedding(2, 32.6)));
-}
-
-TEST_F(IR2VecTestFixture, GetBBVecMap_Symbolic) {
- auto Emb = Embedder::create(IR2VecKind::Symbolic, *F, *V);
- ASSERT_TRUE(static_cast<bool>(Emb));
-
- const auto &BBMap = Emb->getBBVecMap();
-
- EXPECT_EQ(BBMap.size(), 1u);
- EXPECT_TRUE(BBMap.count(BB));
- EXPECT_EQ(BBMap.at(BB).size(), 2u);
-
- // BB vector should be sum of add and ret: {25.5, 25.5} + {15.5, 15.5} =
- // {41.0, 41.0}
- EXPECT_TRUE(BBMap.at(BB).approximatelyEquals(Embedding(2, 41.0)));
-}
-
-TEST_F(IR2VecTestFixture, GetBBVecMap_FlowAware) {
+TEST_F(IR2VecTestFixture, GetInstVec_FlowAware) {
auto Emb = Embedder::create(IR2VecKind::FlowAware, *F, *V);
ASSERT_TRUE(static_cast<bool>(Emb));
- const auto &BBMap = Emb->getBBVecMap();
-
- EXPECT_EQ(BBMap.size(), 1u);
- EXPECT_TRUE(BBMap.count(BB));
- EXPECT_EQ(BBMap.at(BB).size(), 2u);
+ const auto &AddEmb = Emb->getInstVector(*AddInst);
+ const auto &RetEmb = Emb->getInstVector(*RetInst);
+ EXPECT_EQ(AddEmb.size(), 2u);
+ EXPECT_EQ(RetEmb.size(), 2u);
- // BB vector should be sum of add and ret: {25.5, 25.5} + {32.6, 32.6} =
- // {58.1, 58.1}
- EXPECT_TRUE(BBMap.at(BB).approximatelyEquals(Embedding(2, 58.1)));
+ EXPECT_TRUE(AddEmb.approximatelyEquals(Embedding(2, 25.5)));
+ EXPECT_TRUE(RetEmb.approximatelyEquals(Embedding(2, 32.6)));
}
TEST_F(IR2VecTestFixture, GetBBVector_Symbolic) {
@@ -394,6 +356,8 @@ TEST_F(IR2VecTestFixture, GetBBVector_Symbolic) {
const auto &BBVec = Emb->getBBVector(*BB);
EXPECT_EQ(BBVec.size(), 2u);
+ // BB vector should be sum of add and ret: {25.5, 25.5} + {15.5, 15.5} =
+ // {41.0, 41.0}
EXPECT_TRUE(BBVec.approximatelyEquals(Embedding(2, 41.0)));
}
@@ -404,6 +368,8 @@ TEST_F(IR2VecTestFixture, GetBBVector_FlowAware) {
const auto &BBVec = Emb->getBBVector(*BB);
EXPECT_EQ(BBVec.size(), 2u);
+ // BB vector should be sum of add and ret: {25.5, 25.5} + {32.6, 32.6} =
+ // {58.1, 58.1}
EXPECT_TRUE(BBVec.approximatelyEquals(Embedding(2, 58.1)));
}
@@ -446,15 +412,9 @@ TEST_F(IR2VecTestFixture, MultipleComputeEmbeddingsConsistency_Symbolic) {
EXPECT_TRUE(FuncVec1.approximatelyEquals(FuncVec3));
EXPECT_TRUE(FuncVec2.approximatelyEquals(FuncVec3));
- // Also check that instruction vectors remain consistent
- const auto &InstMap1 = Emb->getInstVecMap();
- const auto &InstMap2 = Emb->getInstVecMap();
-
- EXPECT_EQ(InstMap1.size(), InstMap2.size());
- for (const auto &[Inst, Vec1] : InstMap1) {
- ASSERT_TRUE(InstMap2.count(Inst));
- EXPECT_TRUE(Vec1.approximatelyEquals(InstMap2.at(Inst)));
- }
+ Emb->invalidateEmbeddings();
+ const auto &FuncVec4 = Emb->getFunctionVector();
+ EXPECT_TRUE(FuncVec1.approximatelyEquals(FuncVec4));
}
TEST_F(IR2VecTestFixture, MultipleComputeEmbeddingsConsistency_FlowAware) {
@@ -473,15 +433,9 @@ TEST_F(IR2VecTestFixture, MultipleComputeEmbeddingsConsistency_FlowAware) {
EXPECT_TRUE(FuncVec1.approximatelyEquals(FuncVec3));
EXPECT_TRUE(FuncVec2.approximatelyEquals(FuncVec3));
- // Also check that instruction vectors remain consistent
- const auto &InstMap1 = Emb->getInstVecMap();
- const auto &InstMap2 = Emb->getInstVecMap();
-
- EXPECT_EQ(InstMap1.size(), InstMap2.size());
- for (const auto &[Inst, Vec1] : InstMap1) {
- ASSERT_TRUE(InstMap2.count(Inst));
- EXPECT_TRUE(Vec1.approximatelyEquals(InstMap2.at(Inst)));
- }
+ Emb->invalidateEmbeddings();
+ const auto &FuncVec4 = Emb->getFunctionVector();
+ EXPECT_TRUE(FuncVec1.approximatelyEquals(FuncVec4));
}
static constexpr unsigned MaxOpcodes = Vocabulary::MaxOpcodes;
diff --git a/llvm/unittests/IR/ConstantFPRangeTest.cpp b/llvm/unittests/IR/ConstantFPRangeTest.cpp
index 255f62d..1436f0f 100644
--- a/llvm/unittests/IR/ConstantFPRangeTest.cpp
+++ b/llvm/unittests/IR/ConstantFPRangeTest.cpp
@@ -767,4 +767,39 @@ TEST_F(ConstantFPRangeTest, makeExactFCmpRegion) {
}
}
+TEST_F(ConstantFPRangeTest, abs) {
+ EXPECT_EQ(Full.abs(),
+ ConstantFPRange(APFloat::getZero(Sem, /*Negative=*/false),
+ APFloat::getInf(Sem, /*Negative=*/false),
+ /*MayBeQNaN=*/true,
+ /*MayBeSNaN=*/true));
+ EXPECT_EQ(Empty.abs(), Empty);
+ EXPECT_EQ(Zero.abs(), PosZero);
+ EXPECT_EQ(PosInf.abs(), PosInf);
+ EXPECT_EQ(NegInf.abs(), PosInf);
+ EXPECT_EQ(Some.abs(), SomePos);
+ EXPECT_EQ(SomeNeg.abs(), SomePos);
+ EXPECT_EQ(NaN.abs(), NaN);
+ EXPECT_EQ(ConstantFPRange::getNonNaN(APFloat(-2.0), APFloat(3.0)).abs(),
+ ConstantFPRange::getNonNaN(APFloat(0.0), APFloat(3.0)));
+ EXPECT_EQ(ConstantFPRange::getNonNaN(APFloat(-3.0), APFloat(2.0)).abs(),
+ ConstantFPRange::getNonNaN(APFloat(0.0), APFloat(3.0)));
+}
+
+TEST_F(ConstantFPRangeTest, negate) {
+ EXPECT_EQ(Full.negate(), Full);
+ EXPECT_EQ(Empty.negate(), Empty);
+ EXPECT_EQ(Zero.negate(), Zero);
+ EXPECT_EQ(PosInf.negate(), NegInf);
+ EXPECT_EQ(NegInf.negate(), PosInf);
+ EXPECT_EQ(Some.negate(), Some);
+ EXPECT_EQ(SomePos.negate(), SomeNeg);
+ EXPECT_EQ(SomeNeg.negate(), SomePos);
+ EXPECT_EQ(NaN.negate(), NaN);
+ EXPECT_EQ(ConstantFPRange::getNonNaN(APFloat(-2.0), APFloat(3.0)).negate(),
+ ConstantFPRange::getNonNaN(APFloat(-3.0), APFloat(2.0)));
+ EXPECT_EQ(ConstantFPRange::getNonNaN(APFloat(-3.0), APFloat(2.0)).negate(),
+ ConstantFPRange::getNonNaN(APFloat(-2.0), APFloat(3.0)));
+}
+
} // anonymous namespace
diff --git a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
index 10f0213..fd8ddb1 100644
--- a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
+++ b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp
@@ -588,7 +588,12 @@ void RuntimeLibcallEmitter::emitSystemRuntimeLibrarySetCalls(
PredicateSorter.insert(
PredicateWithCC()); // No predicate or CC override first.
+ constexpr unsigned BitsPerStorageElt = 64;
DenseMap<PredicateWithCC, LibcallsWithCC> Pred2Funcs;
+
+ SmallVector<uint64_t, 32> BitsetValues(
+ divideCeil(RuntimeLibcallImplDefList.size(), BitsPerStorageElt));
+
for (const Record *Elt : *Elements) {
const RuntimeLibcallImpl *LibCallImpl = getRuntimeLibcallImpl(Elt);
if (!LibCallImpl) {
@@ -597,16 +602,24 @@ void RuntimeLibcallEmitter::emitSystemRuntimeLibrarySetCalls(
continue;
}
+ size_t BitIdx = LibCallImpl->getEnumVal();
+ uint64_t BitmaskVal = uint64_t(1) << (BitIdx % BitsPerStorageElt);
+ size_t BitsetIdx = BitIdx / BitsPerStorageElt;
+
auto It = Func2Preds.find(LibCallImpl);
if (It == Func2Preds.end()) {
+ BitsetValues[BitsetIdx] |= BitmaskVal;
Pred2Funcs[PredicateWithCC()].LibcallImpls.push_back(LibCallImpl);
continue;
}
for (const Record *Pred : It->second.first) {
const Record *CC = It->second.second;
- PredicateWithCC Key(Pred, CC);
+ AvailabilityPredicate SubsetPredicate(Pred);
+ if (SubsetPredicate.isAlwaysAvailable())
+ BitsetValues[BitsetIdx] |= BitmaskVal;
+ PredicateWithCC Key(Pred, CC);
auto &Entry = Pred2Funcs[Key];
Entry.LibcallImpls.push_back(LibCallImpl);
Entry.CallingConv = It->second.second;
@@ -614,6 +627,22 @@ void RuntimeLibcallEmitter::emitSystemRuntimeLibrarySetCalls(
}
}
+ OS << " static constexpr LibcallImplBitset SystemAvailableImpls({\n"
+ << indent(6);
+
+ ListSeparator LS;
+ unsigned EntryCount = 0;
+ for (uint64_t Bits : BitsetValues) {
+ if (EntryCount++ == 4) {
+ EntryCount = 1;
+ OS << ",\n" << indent(6);
+ } else
+ OS << LS;
+ OS << format_hex(Bits, 16);
+ }
+ OS << "\n });\n"
+ " AvailableLibcallImpls = SystemAvailableImpls;\n\n";
+
SmallVector<PredicateWithCC, 0> SortedPredicates =
PredicateSorter.takeVector();
diff --git a/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn
index 5f9eb9a..eb8aef2 100644
--- a/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn
@@ -27,8 +27,6 @@ static_library("Analysis") {
"FixitUtil.cpp",
"IntervalPartition.cpp",
"IssueHash.cpp",
- "LifetimeAnnotations.cpp",
- "LifetimeSafety.cpp",
"LiveVariables.cpp",
"MacroExpansionContext.cpp",
"ObjCNoReturn.cpp",
diff --git a/llvm/utils/gn/secondary/clang/lib/Analysis/LifetimeSafety/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Analysis/LifetimeSafety/BUILD.gn
new file mode 100644
index 0000000..a148e78
--- /dev/null
+++ b/llvm/utils/gn/secondary/clang/lib/Analysis/LifetimeSafety/BUILD.gn
@@ -0,0 +1,16 @@
+static_library("LifetimeSafety") {
+ output_name = "clangAnalysisLifetimeSafety"
+ configs += [ "//llvm/utils/gn/build:clang_code" ]
+ deps = [ "//clang/lib/Basic" ]
+ sources = [
+ "Checker.cpp",
+ "Facts.cpp",
+ "FactsGenerator.cpp",
+ "LifetimeAnnotations.cpp",
+ "LifetimeSafety.cpp",
+ "LiveOrigins.cpp",
+ "Loans.cpp",
+ "LoanPropagation.cpp",
+ "Origins.cpp",
+ ]
+}
diff --git a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn
index 96ff481..14f44c4 100644
--- a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn
@@ -29,6 +29,7 @@ static_library("Sema") {
"//clang/lib/APINotes",
"//clang/lib/AST",
"//clang/lib/Analysis",
+ "//clang/lib/Analysis/LifetimeSafety",
"//clang/lib/Basic",
"//clang/lib/Edit",
"//clang/lib/Lex",
diff --git a/llvm/utils/gn/secondary/llvm/unittests/ADT/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/ADT/BUILD.gn
index 92e596e..8d19d30 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/ADT/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/ADT/BUILD.gn
@@ -21,6 +21,7 @@ unittest("ADTTests") {
"BitTest.cpp",
"BitVectorTest.cpp",
"BitmaskEnumTest.cpp",
+ "BitsetTest.cpp",
"BreadthFirstIteratorTest.cpp",
"BumpPtrListTest.cpp",
"CoalescingBitVectorTest.cpp",