aboutsummaryrefslogtreecommitdiff
path: root/llvm
diff options
context:
space:
mode:
Diffstat (limited to 'llvm')
-rw-r--r--llvm/docs/CodingStandards.rst36
-rw-r--r--llvm/docs/CommandGuide/llc.rst7
-rw-r--r--llvm/docs/CommandGuide/lli.rst5
-rw-r--r--llvm/docs/GlobalISel/GenericOpcode.rst2
-rw-r--r--llvm/docs/SourceLevelDebugging.rst2
-rw-r--r--llvm/include/llvm/ADT/BitmaskEnum.h4
-rw-r--r--llvm/include/llvm/Analysis/ScalarEvolution.h8
-rw-r--r--llvm/include/llvm/AsmParser/AsmParserContext.h70
-rw-r--r--llvm/include/llvm/AsmParser/FileLoc.h56
-rw-r--r--llvm/include/llvm/AsmParser/LLLexer.h24
-rw-r--r--llvm/include/llvm/AsmParser/LLParser.h11
-rw-r--r--llvm/include/llvm/AsmParser/Parser.h16
-rw-r--r--llvm/include/llvm/CodeGen/BasicTTIImpl.h24
-rw-r--r--llvm/include/llvm/CodeGen/CodeGenTargetMachineImpl.h2
-rw-r--r--llvm/include/llvm/CodeGen/DebugHandlerBase.h2
-rw-r--r--llvm/include/llvm/CodeGen/DroppedVariableStatsMIR.h9
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/CSEInfo.h6
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/Combiner.h2
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h2
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h2
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h2
-rw-r--r--llvm/include/llvm/CodeGen/MIR2Vec.h138
-rw-r--r--llvm/include/llvm/CodeGen/MachineModuleSlotTracker.h2
-rw-r--r--llvm/include/llvm/CodeGen/MachineOutliner.h2
-rw-r--r--llvm/include/llvm/CodeGen/ResourcePriorityQueue.h2
-rw-r--r--llvm/include/llvm/CodeGen/SDPatternMatch.h5
-rw-r--r--llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h1
-rw-r--r--llvm/include/llvm/CodeGen/TargetRegisterInfo.h2
-rw-r--r--llvm/include/llvm/CodeGen/VLIWMachineScheduler.h2
-rw-r--r--llvm/include/llvm/IR/ModuleSummaryIndex.h6
-rw-r--r--llvm/include/llvm/IRReader/IRReader.h17
-rw-r--r--llvm/include/llvm/Support/AllocToken.h68
-rw-r--r--llvm/include/llvm/Transforms/Instrumentation/AllocToken.h2
-rw-r--r--llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h5
-rw-r--r--llvm/lib/Analysis/DXILResource.cpp8
-rw-r--r--llvm/lib/Analysis/InstructionSimplify.cpp92
-rw-r--r--llvm/lib/Analysis/ScalarEvolution.cpp17
-rw-r--r--llvm/lib/Analysis/models/x86SeedEmbeddingVocab100D.json70
-rw-r--r--llvm/lib/AsmParser/AsmParserContext.cpp89
-rw-r--r--llvm/lib/AsmParser/CMakeLists.txt1
-rw-r--r--llvm/lib/AsmParser/LLLexer.cpp2
-rw-r--r--llvm/lib/AsmParser/LLParser.cpp24
-rw-r--r--llvm/lib/AsmParser/Parser.cpp31
-rw-r--r--llvm/lib/Bitcode/Reader/BitcodeReader.cpp2
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h2
-rw-r--r--llvm/lib/CodeGen/GlobalISel/Combiner.cpp4
-rw-r--r--llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp2
-rw-r--r--llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp2
-rw-r--r--llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp2
-rw-r--r--llvm/lib/CodeGen/MIR2Vec.cpp335
-rw-r--r--llvm/lib/CodeGen/MachineBasicBlock.cpp2
-rw-r--r--llvm/lib/CodeGen/PeepholeOptimizer.cpp2
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp45
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp7
-rw-r--r--llvm/lib/CodeGen/ShrinkWrap.cpp2
-rw-r--r--llvm/lib/IR/AutoUpgrade.cpp30
-rw-r--r--llvm/lib/IR/ModuleSummaryIndex.cpp8
-rw-r--r--llvm/lib/IRReader/IRReader.cpp13
-rw-r--r--llvm/lib/LTO/LTO.cpp2
-rw-r--r--llvm/lib/Passes/PassBuilder.cpp25
-rw-r--r--llvm/lib/Passes/PassRegistry.def5
-rw-r--r--llvm/lib/Support/AllocToken.cpp61
-rw-r--r--llvm/lib/Support/CMakeLists.txt1
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrAtomics.td12
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrGISel.td2
-rw-r--r--llvm/lib/Target/AArch64/AArch64RegisterInfo.td2
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp69
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.h46
-rw-r--r--llvm/lib/Target/BPF/BTFDebug.cpp19
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelLowering.cpp4
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVCBufferAccess.cpp14
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp5
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td17
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp2
-rw-r--r--llvm/lib/Target/X86/X86InstrAVX512.td2
-rw-r--r--llvm/lib/Target/X86/X86ScheduleZnver4.td110
-rw-r--r--llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp12
-rw-r--r--llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp78
-rw-r--r--llvm/lib/Transforms/Instrumentation/AllocToken.cpp75
-rw-r--r--llvm/lib/Transforms/Utils/BasicBlockUtils.cpp10
-rw-r--r--llvm/lib/Transforms/Utils/PredicateInfo.cpp13
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp12
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp2
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanValue.h10
-rw-r--r--llvm/test/Analysis/BasicAA/matrix-intrinsics.ll30
-rw-r--r--llvm/test/Assembler/autoupgrade-invalid-masked-align.ll49
-rw-r--r--llvm/test/Bitcode/thinlto-deadstrip-flag.ll20
-rw-r--r--llvm/test/Bitcode/thinlto-index-flags.ll39
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir2
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir2
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir2
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-mov-debug-locs.mir4
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-detect-vec-redux.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-fma-combine-with-fpfusion.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-ld1.ll26
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-misched-basic-A53.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-misched-basic-A57.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-rounding.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-storebytesmerge.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/bti-branch-relaxation.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/consthoist-gep.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/dag-combine-invaraints.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/load-zext-bitcast.ll525
-rw-r--r--llvm/test/CodeGen/AArch64/partial-pipeline-execution.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/recp-fastmath.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/shrink-wrap-const-pool-access.mir76
-rw-r--r--llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/stack_guard_remat.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/vector_merge_dep_check.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/wineh-frame5.mir4
-rw-r--r--llvm/test/CodeGen/AArch64/wineh-frame6.mir4
-rw-r--r--llvm/test/CodeGen/AArch64/wineh-frame7.mir4
-rw-r--r--llvm/test/CodeGen/AArch64/wineh-frame8.mir2
-rw-r--r--llvm/test/CodeGen/AArch64/wineh5.mir4
-rw-r--r--llvm/test/CodeGen/AArch64/wineh_shrinkwrap.mir4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/addsub64_carry.ll36
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll1260
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll135
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll210
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll585
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll90
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll90
-rw-r--r--llvm/test/CodeGen/AMDGPU/carryout-selection.ll614
-rw-r--r--llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll3
-rw-r--r--llvm/test/CodeGen/AMDGPU/ctpop16.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll25
-rw-r--r--llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll35
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll128
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptrunc.ll36
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll115
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll81
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll81
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll115
-rw-r--r--llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll128
-rw-r--r--llvm/test/CodeGen/AMDGPU/optimize-compare.mir82
-rw-r--r--llvm/test/CodeGen/AMDGPU/s_cmp_0.ll64
-rw-r--r--llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/sdiv64.ll146
-rw-r--r--llvm/test/CodeGen/AMDGPU/srem.ll654
-rw-r--r--llvm/test/CodeGen/AMDGPU/srem64.ll207
-rw-r--r--llvm/test/CodeGen/AMDGPU/uaddo.ll54
-rw-r--r--llvm/test/CodeGen/AMDGPU/udiv64.ll80
-rw-r--r--llvm/test/CodeGen/AMDGPU/urem64.ll146
-rw-r--r--llvm/test/CodeGen/AMDGPU/usubo.ll54
-rw-r--r--llvm/test/CodeGen/AMDGPU/wave32.ll190
-rw-r--r--llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll8
-rw-r--r--llvm/test/CodeGen/BPF/BTF/ptr-named-2.ll59
-rw-r--r--llvm/test/CodeGen/BPF/BTF/ptr-named.ll75
-rw-r--r--llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll9
-rw-r--r--llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json18
-rw-r--r--llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json18
-rw-r--r--llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json11
-rw-r--r--llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json11
-rw-r--r--llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt291
-rw-r--r--llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt291
-rw-r--r--llvm/test/CodeGen/MIR2Vec/if-else.mir8
-rw-r--r--llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir22
-rw-r--r--llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll6
-rw-r--r--llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll6
-rw-r--r--llvm/test/CodeGen/WebAssembly/memory-interleave.ll1608
-rw-r--r--llvm/test/CodeGen/WebAssembly/simd-relaxed-fmax.ll60
-rw-r--r--llvm/test/CodeGen/WebAssembly/simd-relaxed-fmin.ll59
-rw-r--r--llvm/test/CodeGen/WebAssembly/simd-vector-trunc.ll39
-rw-r--r--llvm/test/CodeGen/X86/bf16-fast-isel.ll66
-rw-r--r--llvm/test/CodeGen/X86/fp128-select.ll6
-rw-r--r--llvm/test/DebugInfo/X86/aggressive-instcombine-store-merge-dbg.ll49
-rw-r--r--llvm/test/Instrumentation/AllocToken/basic.ll2
-rw-r--r--llvm/test/Instrumentation/AllocToken/basic32.ll2
-rw-r--r--llvm/test/Instrumentation/AllocToken/fast.ll2
-rw-r--r--llvm/test/Instrumentation/AllocToken/intrinsic.ll2
-rw-r--r--llvm/test/Instrumentation/AllocToken/intrinsic32.ll2
-rw-r--r--llvm/test/Instrumentation/AllocToken/invoke.ll2
-rw-r--r--llvm/test/Instrumentation/AllocToken/nonlibcalls.ll2
-rw-r--r--llvm/test/Instrumentation/AllocToken/typehashpointersplit.ll2
-rw-r--r--llvm/test/LTO/AArch64/Inputs/bar.ll (renamed from llvm/test/LTO/AArch64/TestInputs/bar.ll)0
-rw-r--r--llvm/test/LTO/AArch64/Inputs/fiz.ll (renamed from llvm/test/LTO/AArch64/TestInputs/fiz.ll)0
-rw-r--r--llvm/test/LTO/AArch64/Inputs/foo.ll (renamed from llvm/test/LTO/AArch64/TestInputs/foo.ll)0
-rw-r--r--llvm/test/LTO/AArch64/Inputs/old.ll (renamed from llvm/test/LTO/AArch64/TestInputs/old.ll)0
-rw-r--r--llvm/test/LTO/AArch64/link-branch-target-enforcement.ll2
-rw-r--r--llvm/test/LTO/AArch64/link-sign-return-address.ll8
-rw-r--r--llvm/test/MC/AMDGPU/literals.s1624
-rw-r--r--llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll338
-rw-r--r--llvm/test/Transforms/GVN/matrix-intrinsics.ll136
-rw-r--r--llvm/test/Transforms/InstCombine/select_with_identical_phi.ll243
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll14
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll172
-rw-r--r--llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll1000
-rw-r--r--llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll30
-rw-r--r--llvm/test/Transforms/SCCP/conditions-ranges.ll25
-rw-r--r--llvm/test/Transforms/Util/PredicateInfo/testandor.ll27
-rw-r--r--llvm/test/Transforms/WholeProgramDevirt/speculative-devirt-single-impl.ll132
-rw-r--r--llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll7
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver4/resources-avx1.s18
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver4/resources-avx2.s26
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s4
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s14
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdq.s10
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdqvl.s18
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver4/resources-bmi1.s16
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver4/resources-cmpxchg.s18
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver4/resources-pclmul.s10
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver4/resources-sse42.s8
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver4/resources-vpclmulqdq.s10
-rw-r--r--llvm/test/tools/llvm-mca/X86/Znver4/resources-x86_64.s98
-rw-r--r--llvm/unittests/AsmParser/AsmParserTest.cpp55
-rw-r--r--llvm/unittests/CodeGen/AsmPrinterDwarfTest.cpp16
-rw-r--r--llvm/unittests/CodeGen/InstrRefLDVTest.cpp2
-rw-r--r--llvm/unittests/CodeGen/MIR2VecTest.cpp369
-rw-r--r--llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp25
-rw-r--r--llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp29
-rw-r--r--llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn1
-rwxr-xr-xllvm/utils/update_mc_test_checks.py28
223 files changed, 10514 insertions, 5030 deletions
diff --git a/llvm/docs/CodingStandards.rst b/llvm/docs/CodingStandards.rst
index 8677d89..63f6663 100644
--- a/llvm/docs/CodingStandards.rst
+++ b/llvm/docs/CodingStandards.rst
@@ -1692,29 +1692,29 @@ faraway places in the file to tell that the function is local:
Don't Use Braces on Simple Single-Statement Bodies of if/else/loop Statements
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-When writing the body of an ``if``, ``else``, or for/while loop statement, we
-prefer to omit the braces to avoid unnecessary line noise. However, braces
-should be used in cases where the omission of braces harms the readability and
-maintainability of the code.
+When writing the body of an ``if``, ``else``, or ``for``/``while`` loop
+statement, we aim to reduce unnecessary line noise.
-We consider that readability is harmed when omitting the brace in the presence
-of a single statement that is accompanied by a comment (assuming the comment
-can't be hoisted above the ``if`` or loop statement, see below).
+**Omit braces when:**
-Similarly, braces should be used when a single-statement body is complex enough
-that it becomes difficult to see where the block containing the following
-statement began. An ``if``/``else`` chain or a loop is considered a single
-statement for this rule, and this rule applies recursively.
+* The body consists of a single **simple** statement.
+* The single statement is not preceded by a comment.
+ (Hoist comments above the control statement if you can.)
+* An ``else`` clause, if present, also meets the above criteria (single
+ simple statement, no associated comments).
-This list is not exhaustive. For example, readability is also harmed if an
-``if``/``else`` chain does not use braced bodies for either all or none of its
-members, or has complex conditionals, deep nesting, etc. The examples below
-intend to provide some guidelines.
+**Use braces in all other cases, including:**
-Maintainability is harmed if the body of an ``if`` ends with a (directly or
-indirectly) nested ``if`` statement with no ``else``. Braces on the outer ``if``
-would help to avoid running into a "dangling else" situation.
+* Multi-statement bodies
+* Single-statement bodies with non-hoistable comments
+* Complex single-statement bodies (e.g., deep nesting, complex nested
+ loops)
+* Inconsistent bracing within ``if``/``else if``/``else`` chains (if one
+ block requires braces, all must)
+* ``if`` statements ending with a nested ``if`` lacking an ``else`` (to
+ prevent "dangling else")
+The examples below provide guidelines for these cases:
.. code-block:: c++
diff --git a/llvm/docs/CommandGuide/llc.rst b/llvm/docs/CommandGuide/llc.rst
index 900649f..cc670f6 100644
--- a/llvm/docs/CommandGuide/llc.rst
+++ b/llvm/docs/CommandGuide/llc.rst
@@ -125,13 +125,6 @@ End-user Options
Enable setting the FP exceptions build attribute not to use exceptions.
-.. option:: --enable-unsafe-fp-math
-
- Enable optimizations that make unsafe assumptions about IEEE math (e.g. that
- addition is associative) or may not work for all input ranges. These
- optimizations allow the code generator to make use of some instructions which
- would otherwise not be usable (such as ``fsin`` on X86).
-
.. option:: --stats
Print statistics recorded by code-generation passes.
diff --git a/llvm/docs/CommandGuide/lli.rst b/llvm/docs/CommandGuide/lli.rst
index 94c0013..8afe10d 100644
--- a/llvm/docs/CommandGuide/lli.rst
+++ b/llvm/docs/CommandGuide/lli.rst
@@ -107,11 +107,6 @@ FLOATING POINT OPTIONS
Enable optimizations that assume no NAN values.
-.. option:: -enable-unsafe-fp-math
-
- Causes :program:`lli` to enable optimizations that may decrease floating point
- precision.
-
.. option:: -soft-float
Causes :program:`lli` to generate software floating point library calls instead of
diff --git a/llvm/docs/GlobalISel/GenericOpcode.rst b/llvm/docs/GlobalISel/GenericOpcode.rst
index b055327..661a115 100644
--- a/llvm/docs/GlobalISel/GenericOpcode.rst
+++ b/llvm/docs/GlobalISel/GenericOpcode.rst
@@ -504,7 +504,7 @@ undefined.
G_ABDS, G_ABDU
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Compute the absolute difference (signed and unsigned), e.g. abs(x-y).
+Compute the absolute difference (signed and unsigned), e.g. trunc(abs(ext(x)-ext(y)).
.. code-block:: none
diff --git a/llvm/docs/SourceLevelDebugging.rst b/llvm/docs/SourceLevelDebugging.rst
index f057b2d..12b5e3e 100644
--- a/llvm/docs/SourceLevelDebugging.rst
+++ b/llvm/docs/SourceLevelDebugging.rst
@@ -674,7 +674,7 @@ Compiled to LLVM, this function would be represented like this:
ret void, !dbg !24
}
- attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
!llvm.dbg.cu = !{!0}
diff --git a/llvm/include/llvm/ADT/BitmaskEnum.h b/llvm/include/llvm/ADT/BitmaskEnum.h
index d464cbc..9555fad 100644
--- a/llvm/include/llvm/ADT/BitmaskEnum.h
+++ b/llvm/include/llvm/ADT/BitmaskEnum.h
@@ -106,7 +106,7 @@ struct is_bitmask_enum : std::false_type {};
template <typename E>
struct is_bitmask_enum<
- E, std::enable_if_t<sizeof(E::LLVM_BITMASK_LARGEST_ENUMERATOR) >= 0>>
+ E, std::void_t<decltype(E::LLVM_BITMASK_LARGEST_ENUMERATOR)>>
: std::true_type {};
/// Trait class to determine bitmask enumeration largest bit.
@@ -114,7 +114,7 @@ template <typename E, typename Enable = void> struct largest_bitmask_enum_bit;
template <typename E>
struct largest_bitmask_enum_bit<
- E, std::enable_if_t<sizeof(E::LLVM_BITMASK_LARGEST_ENUMERATOR) >= 0>> {
+ E, std::void_t<decltype(E::LLVM_BITMASK_LARGEST_ENUMERATOR)>> {
using UnderlyingTy = std::underlying_type_t<E>;
static constexpr UnderlyingTy value =
static_cast<UnderlyingTy>(E::LLVM_BITMASK_LARGEST_ENUMERATOR);
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index 3d3ec14..04ea769 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -638,8 +638,12 @@ public:
/// \p GEP The GEP. The indices contained in the GEP itself are ignored,
/// instead we use IndexExprs.
/// \p IndexExprs The expressions for the indices.
- LLVM_ABI const SCEV *
- getGEPExpr(GEPOperator *GEP, const SmallVectorImpl<const SCEV *> &IndexExprs);
+ LLVM_ABI const SCEV *getGEPExpr(GEPOperator *GEP,
+ ArrayRef<const SCEV *> IndexExprs);
+ LLVM_ABI const SCEV *getGEPExpr(const SCEV *BaseExpr,
+ ArrayRef<const SCEV *> IndexExprs,
+ Type *SrcElementTy,
+ GEPNoWrapFlags NW = GEPNoWrapFlags::none());
LLVM_ABI const SCEV *getAbsExpr(const SCEV *Op, bool IsNSW);
LLVM_ABI const SCEV *getMinMaxExpr(SCEVTypes Kind,
SmallVectorImpl<const SCEV *> &Operands);
diff --git a/llvm/include/llvm/AsmParser/AsmParserContext.h b/llvm/include/llvm/AsmParser/AsmParserContext.h
new file mode 100644
index 0000000..1a397486
--- /dev/null
+++ b/llvm/include/llvm/AsmParser/AsmParserContext.h
@@ -0,0 +1,70 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ASMPARSER_ASMPARSERCONTEXT_H
+#define LLVM_ASMPARSER_ASMPARSERCONTEXT_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/AsmParser/FileLoc.h"
+#include "llvm/IR/Value.h"
+#include <optional>
+
+namespace llvm {
+
+/// Registry of file location information for LLVM IR constructs.
+///
+/// This class provides access to the file location information
+/// for various LLVM IR constructs. Currently, it supports Function,
+/// BasicBlock and Instruction locations.
+///
+/// When available, it can answer queries about what is at a given
+/// file location, as well as where in a file a given IR construct
+/// is.
+///
+/// This information is optionally emitted by the LLParser while
+/// it reads LLVM textual IR.
+class AsmParserContext {
+ DenseMap<Function *, FileLocRange> Functions;
+ DenseMap<BasicBlock *, FileLocRange> Blocks;
+ DenseMap<Instruction *, FileLocRange> Instructions;
+
+public:
+ std::optional<FileLocRange> getFunctionLocation(const Function *) const;
+ std::optional<FileLocRange> getBlockLocation(const BasicBlock *) const;
+ std::optional<FileLocRange> getInstructionLocation(const Instruction *) const;
+ /// Get the function at the requested location range.
+ /// If no single function occupies the queried range, or the record is
+ /// missing, a nullptr is returned.
+ Function *getFunctionAtLocation(const FileLocRange &) const;
+ /// Get the function at the requested location.
+ /// If no function occupies the queried location, or the record is missing, a
+ /// nullptr is returned.
+ Function *getFunctionAtLocation(const FileLoc &) const;
+ /// Get the block at the requested location range.
+ /// If no single block occupies the queried range, or the record is missing, a
+ /// nullptr is returned.
+ BasicBlock *getBlockAtLocation(const FileLocRange &) const;
+ /// Get the block at the requested location.
+ /// If no block occupies the queried location, or the record is missing, a
+ /// nullptr is returned.
+ BasicBlock *getBlockAtLocation(const FileLoc &) const;
+ /// Get the instruction at the requested location range.
+ /// If no single instruction occupies the queried range, or the record is
+ /// missing, a nullptr is returned.
+ Instruction *getInstructionAtLocation(const FileLocRange &) const;
+ /// Get the instruction at the requested location.
+ /// If no instruction occupies the queried location, or the record is missing,
+ /// a nullptr is returned.
+ Instruction *getInstructionAtLocation(const FileLoc &) const;
+ bool addFunctionLocation(Function *, const FileLocRange &);
+ bool addBlockLocation(BasicBlock *, const FileLocRange &);
+ bool addInstructionLocation(Instruction *, const FileLocRange &);
+};
+} // namespace llvm
+
+#endif
diff --git a/llvm/include/llvm/AsmParser/FileLoc.h b/llvm/include/llvm/AsmParser/FileLoc.h
new file mode 100644
index 0000000..02c1849
--- /dev/null
+++ b/llvm/include/llvm/AsmParser/FileLoc.h
@@ -0,0 +1,56 @@
+//===-- FileLoc.h ---------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ASMPARSER_FILELOC_H
+#define LLVM_ASMPARSER_FILELOC_H
+
+#include <cassert>
+#include <utility>
+
+namespace llvm {
+
+/// Struct holding Line:Column location
+struct FileLoc {
+ /// 0-based line number
+ unsigned Line;
+ /// 0-based column number
+ unsigned Col;
+
+ bool operator<=(const FileLoc &RHS) const {
+ return Line < RHS.Line || (Line == RHS.Line && Col <= RHS.Col);
+ }
+
+ bool operator<(const FileLoc &RHS) const {
+ return Line < RHS.Line || (Line == RHS.Line && Col < RHS.Col);
+ }
+
+ FileLoc(unsigned L, unsigned C) : Line(L), Col(C) {}
+ FileLoc(std::pair<unsigned, unsigned> LC) : Line(LC.first), Col(LC.second) {}
+};
+
+/// Struct holding a semiopen range [Start; End)
+struct FileLocRange {
+ FileLoc Start;
+ FileLoc End;
+
+ FileLocRange() : Start(0, 0), End(0, 0) {}
+
+ FileLocRange(FileLoc S, FileLoc E) : Start(S), End(E) {
+ assert(Start <= End);
+ }
+
+ bool contains(FileLoc L) const { return Start <= L && L < End; }
+
+ bool contains(FileLocRange LR) const {
+ return Start <= LR.Start && LR.End <= End;
+ }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/include/llvm/AsmParser/LLLexer.h b/llvm/include/llvm/AsmParser/LLLexer.h
index 501a7ae..0e379e5 100644
--- a/llvm/include/llvm/AsmParser/LLLexer.h
+++ b/llvm/include/llvm/AsmParser/LLLexer.h
@@ -13,22 +13,25 @@
#ifndef LLVM_ASMPARSER_LLLEXER_H
#define LLVM_ASMPARSER_LLLEXER_H
-#include "LLToken.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APSInt.h"
+#include "llvm/AsmParser/LLToken.h"
#include "llvm/Support/SMLoc.h"
+#include "llvm/Support/SourceMgr.h"
#include <string>
namespace llvm {
class Type;
class SMDiagnostic;
- class SourceMgr;
class LLVMContext;
class LLLexer {
const char *CurPtr;
StringRef CurBuf;
+ /// The end (exclusive) of the previous token.
+ const char *PrevTokEnd = nullptr;
+
enum class ErrorPriority {
None, // No error message present.
Parser, // Errors issued by parser.
@@ -62,9 +65,7 @@ namespace llvm {
explicit LLLexer(StringRef StartBuf, SourceMgr &SM, SMDiagnostic &,
LLVMContext &C);
- lltok::Kind Lex() {
- return CurKind = LexToken();
- }
+ lltok::Kind Lex() { return CurKind = LexToken(); }
typedef SMLoc LocTy;
LocTy getLoc() const { return SMLoc::getFromPointer(TokStart); }
@@ -79,6 +80,19 @@ namespace llvm {
IgnoreColonInIdentifiers = val;
}
+ /// Get the line, column position of the start of the current token,
+ /// zero-indexed
+ std::pair<unsigned, unsigned> getTokLineColumnPos() {
+ auto LC = SM.getLineAndColumn(SMLoc::getFromPointer(TokStart));
+ return {LC.first - 1, LC.second - 1};
+ }
+ /// Get the line, column position of the end of the previous token,
+ /// zero-indexed exclusive
+ std::pair<unsigned, unsigned> getPrevTokEndLineColumnPos() {
+ auto LC = SM.getLineAndColumn(SMLoc::getFromPointer(PrevTokEnd));
+ return {LC.first - 1, LC.second - 1};
+ }
+
// This returns true as a convenience for the parser functions that return
// true on error.
bool ParseError(LocTy ErrorLoc, const Twine &Msg) {
diff --git a/llvm/include/llvm/AsmParser/LLParser.h b/llvm/include/llvm/AsmParser/LLParser.h
index c01de4a..9eb31d7 100644
--- a/llvm/include/llvm/AsmParser/LLParser.h
+++ b/llvm/include/llvm/AsmParser/LLParser.h
@@ -13,8 +13,9 @@
#ifndef LLVM_ASMPARSER_LLPARSER_H
#define LLVM_ASMPARSER_LLPARSER_H
-#include "LLLexer.h"
#include "llvm/ADT/StringMap.h"
+#include "llvm/AsmParser/AsmParserContext.h"
+#include "llvm/AsmParser/LLLexer.h"
#include "llvm/AsmParser/NumberedValues.h"
#include "llvm/AsmParser/Parser.h"
#include "llvm/IR/Attributes.h"
@@ -177,6 +178,9 @@ namespace llvm {
// Map of module ID to path.
std::map<unsigned, StringRef> ModuleIdMap;
+ /// Keeps track of source locations for Values, BasicBlocks, and Functions.
+ AsmParserContext *ParserContext;
+
/// Only the llvm-as tool may set this to false to bypass
/// UpgradeDebuginfo so it can generate broken bitcode.
bool UpgradeDebugInfo;
@@ -189,10 +193,11 @@ namespace llvm {
public:
LLParser(StringRef F, SourceMgr &SM, SMDiagnostic &Err, Module *M,
ModuleSummaryIndex *Index, LLVMContext &Context,
- SlotMapping *Slots = nullptr)
+ SlotMapping *Slots = nullptr,
+ AsmParserContext *ParserContext = nullptr)
: Context(Context), OPLex(F, SM, Err, Context),
Lex(F, SM, Err, Context), M(M), Index(Index), Slots(Slots),
- BlockAddressPFS(nullptr) {}
+ BlockAddressPFS(nullptr), ParserContext(ParserContext) {}
bool Run(
bool UpgradeDebugInfo,
DataLayoutCallbackTy DataLayoutCallback = [](StringRef, StringRef) {
diff --git a/llvm/include/llvm/AsmParser/Parser.h b/llvm/include/llvm/AsmParser/Parser.h
index c900b79..22b0881 100644
--- a/llvm/include/llvm/AsmParser/Parser.h
+++ b/llvm/include/llvm/AsmParser/Parser.h
@@ -15,6 +15,7 @@
#include "llvm/ADT/STLFunctionalExtras.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/AsmParser/AsmParserContext.h"
#include "llvm/Support/Compiler.h"
#include <memory>
#include <optional>
@@ -62,7 +63,8 @@ parseAssemblyFile(StringRef Filename, SMDiagnostic &Err, LLVMContext &Context,
/// parsing.
LLVM_ABI std::unique_ptr<Module>
parseAssemblyString(StringRef AsmString, SMDiagnostic &Err,
- LLVMContext &Context, SlotMapping *Slots = nullptr);
+ LLVMContext &Context, SlotMapping *Slots = nullptr,
+ AsmParserContext *ParserContext = nullptr);
/// Holds the Module and ModuleSummaryIndex returned by the interfaces
/// that parse both.
@@ -128,9 +130,9 @@ parseSummaryIndexAssemblyString(StringRef AsmString, SMDiagnostic &Err);
LLVM_ABI std::unique_ptr<Module> parseAssembly(
MemoryBufferRef F, SMDiagnostic &Err, LLVMContext &Context,
SlotMapping *Slots = nullptr,
- DataLayoutCallbackTy DataLayoutCallback = [](StringRef, StringRef) {
- return std::nullopt;
- });
+ DataLayoutCallbackTy DataLayoutCallback =
+ [](StringRef, StringRef) { return std::nullopt; },
+ AsmParserContext *ParserContext = nullptr);
/// Parse LLVM Assembly including the summary index from a MemoryBuffer.
///
@@ -169,9 +171,9 @@ parseSummaryIndexAssembly(MemoryBufferRef F, SMDiagnostic &Err);
LLVM_ABI bool parseAssemblyInto(
MemoryBufferRef F, Module *M, ModuleSummaryIndex *Index, SMDiagnostic &Err,
SlotMapping *Slots = nullptr,
- DataLayoutCallbackTy DataLayoutCallback = [](StringRef, StringRef) {
- return std::nullopt;
- });
+ DataLayoutCallbackTy DataLayoutCallback =
+ [](StringRef, StringRef) { return std::nullopt; },
+ AsmParserContext *ParserContext = nullptr);
/// Parse a type and a constant value in the given string.
///
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 4f27d9f1..76b6c8e 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -366,7 +366,7 @@ private:
protected:
explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
: BaseT(DL) {}
- virtual ~BasicTTIImplBase() = default;
+ ~BasicTTIImplBase() override = default;
using TargetTransformInfoImplBase::DL;
@@ -821,13 +821,13 @@ public:
SimplifyAndSetOp);
}
- virtual std::optional<unsigned>
+ std::optional<unsigned>
getCacheSize(TargetTransformInfo::CacheLevel Level) const override {
return std::optional<unsigned>(
getST()->getCacheSize(static_cast<unsigned>(Level)));
}
- virtual std::optional<unsigned>
+ std::optional<unsigned>
getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const override {
std::optional<unsigned> TargetResult =
getST()->getCacheAssociativity(static_cast<unsigned>(Level));
@@ -838,31 +838,31 @@ public:
return BaseT::getCacheAssociativity(Level);
}
- virtual unsigned getCacheLineSize() const override {
+ unsigned getCacheLineSize() const override {
return getST()->getCacheLineSize();
}
- virtual unsigned getPrefetchDistance() const override {
+ unsigned getPrefetchDistance() const override {
return getST()->getPrefetchDistance();
}
- virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
- unsigned NumStridedMemAccesses,
- unsigned NumPrefetches,
- bool HasCall) const override {
+ unsigned getMinPrefetchStride(unsigned NumMemAccesses,
+ unsigned NumStridedMemAccesses,
+ unsigned NumPrefetches,
+ bool HasCall) const override {
return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
NumPrefetches, HasCall);
}
- virtual unsigned getMaxPrefetchIterationsAhead() const override {
+ unsigned getMaxPrefetchIterationsAhead() const override {
return getST()->getMaxPrefetchIterationsAhead();
}
- virtual bool enableWritePrefetching() const override {
+ bool enableWritePrefetching() const override {
return getST()->enableWritePrefetching();
}
- virtual bool shouldPrefetchAddressSpace(unsigned AS) const override {
+ bool shouldPrefetchAddressSpace(unsigned AS) const override {
return getST()->shouldPrefetchAddressSpace(AS);
}
diff --git a/llvm/include/llvm/CodeGen/CodeGenTargetMachineImpl.h b/llvm/include/llvm/CodeGen/CodeGenTargetMachineImpl.h
index 3950b95..7a6feda 100644
--- a/llvm/include/llvm/CodeGen/CodeGenTargetMachineImpl.h
+++ b/llvm/include/llvm/CodeGen/CodeGenTargetMachineImpl.h
@@ -42,7 +42,7 @@ public:
/// Create a pass configuration object to be used by addPassToEmitX methods
/// for generating a pipeline of CodeGen passes.
- virtual TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+ TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
/// Add passes to the specified pass manager to get the specified file
/// emitted. Typically this will involve several steps of code generation.
diff --git a/llvm/include/llvm/CodeGen/DebugHandlerBase.h b/llvm/include/llvm/CodeGen/DebugHandlerBase.h
index fee4bb1..e72801b 100644
--- a/llvm/include/llvm/CodeGen/DebugHandlerBase.h
+++ b/llvm/include/llvm/CodeGen/DebugHandlerBase.h
@@ -118,7 +118,7 @@ private:
// AsmPrinterHandler overrides.
public:
- virtual ~DebugHandlerBase() override;
+ ~DebugHandlerBase() override;
void beginModule(Module *M) override;
diff --git a/llvm/include/llvm/CodeGen/DroppedVariableStatsMIR.h b/llvm/include/llvm/CodeGen/DroppedVariableStatsMIR.h
index bc8dc1b..6da10d8 100644
--- a/llvm/include/llvm/CodeGen/DroppedVariableStatsMIR.h
+++ b/llvm/include/llvm/CodeGen/DroppedVariableStatsMIR.h
@@ -44,12 +44,11 @@ private:
StringRef FuncOrModName);
/// Override base class method to run on an llvm::MachineFunction
/// specifically.
- virtual void
- visitEveryInstruction(unsigned &DroppedCount,
- DenseMap<VarID, DILocation *> &InlinedAtsMap,
- VarID Var) override;
+ void visitEveryInstruction(unsigned &DroppedCount,
+ DenseMap<VarID, DILocation *> &InlinedAtsMap,
+ VarID Var) override;
/// Override base class method to run on DBG_VALUEs specifically.
- virtual void visitEveryDebugRecord(
+ void visitEveryDebugRecord(
DenseSet<VarID> &VarIDSet,
DenseMap<StringRef, DenseMap<VarID, DILocation *>> &InlinedAtsMap,
StringRef FuncName, bool Before) override;
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CSEInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/CSEInfo.h
index ea3f1a8..6701ae0 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CSEInfo.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CSEInfo.h
@@ -40,14 +40,14 @@ public:
// A CSE config for fully optimized builds.
class LLVM_ABI CSEConfigFull : public CSEConfigBase {
public:
- virtual ~CSEConfigFull() = default;
+ ~CSEConfigFull() override = default;
bool shouldCSEOpc(unsigned Opc) override;
};
// Commonly used for O0 config.
class LLVM_ABI CSEConfigConstantOnly : public CSEConfigBase {
public:
- virtual ~CSEConfigConstantOnly() = default;
+ ~CSEConfigConstantOnly() override = default;
bool shouldCSEOpc(unsigned Opc) override;
};
@@ -118,7 +118,7 @@ class LLVM_ABI GISelCSEInfo : public GISelChangeObserver {
public:
GISelCSEInfo() = default;
- virtual ~GISelCSEInfo();
+ ~GISelCSEInfo() override;
void setMF(MachineFunction &MF);
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Combiner.h b/llvm/include/llvm/CodeGen/GlobalISel/Combiner.h
index 39ff90c..7a313f4 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/Combiner.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/Combiner.h
@@ -60,7 +60,7 @@ public:
Combiner(MachineFunction &MF, CombinerInfo &CInfo,
const TargetPassConfig *TPC, GISelValueTracking *VT,
GISelCSEInfo *CSEInfo = nullptr);
- virtual ~Combiner();
+ ~Combiner() override;
virtual bool tryCombineAll(MachineInstr &I) const = 0;
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h b/llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h
index 2db66ba..17d656a 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GISelValueTracking.h
@@ -58,7 +58,7 @@ class LLVM_ABI GISelValueTracking : public GISelChangeObserver {
public:
GISelValueTracking(MachineFunction &MF, unsigned MaxDepth = 6);
- ~GISelValueTracking() = default;
+ ~GISelValueTracking() override = default;
const MachineFunction &getMachineFunction() const { return MF; }
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
index 3d7ccd5..268025e7 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
@@ -656,7 +656,7 @@ private:
IRT->addSuccessorWithProb(Src, Dst, Prob);
}
- virtual ~GISelSwitchLowering() = default;
+ ~GISelSwitchLowering() override = default;
private:
IRTranslator *IRT;
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h b/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h
index cf65f34..5694079 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/InstructionSelector.h
@@ -21,7 +21,7 @@ class GISelObserverWrapper;
class LLVM_ABI InstructionSelector : public GIMatchTableExecutor {
public:
- virtual ~InstructionSelector();
+ ~InstructionSelector() override;
/// Select the (possibly generic) instruction \p I to only use target-specific
/// opcodes. It is OK to insert multiple instructions, but they cannot be
diff --git a/llvm/include/llvm/CodeGen/MIR2Vec.h b/llvm/include/llvm/CodeGen/MIR2Vec.h
index f6b0571..4bcbad7 100644
--- a/llvm/include/llvm/CodeGen/MIR2Vec.h
+++ b/llvm/include/llvm/CodeGen/MIR2Vec.h
@@ -35,6 +35,8 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
@@ -61,7 +63,7 @@ class MIREmbedder;
class SymbolicMIREmbedder;
extern llvm::cl::OptionCategory MIR2VecCategory;
-extern cl::opt<float> OpcWeight;
+extern cl::opt<float> OpcWeight, CommonOperandWeight, RegOperandWeight;
using Embedding = ir2vec::Embedding;
using MachineInstEmbeddingsMap = DenseMap<const MachineInstr *, Embedding>;
@@ -74,31 +76,114 @@ class MIRVocabulary {
friend class llvm::MIR2VecVocabLegacyAnalysis;
using VocabMap = std::map<std::string, ir2vec::Embedding>;
-private:
- // Define vocabulary layout - adapted for MIR
+ // MIRVocabulary Layout:
+ // +-------------------+-----------------------------------------------------+
+ // | Entity Type | Description |
+ // +-------------------+-----------------------------------------------------+
+ // | 1. Opcodes | Target specific opcodes derived from TII, grouped |
+ // | | by instruction semantics. |
+ // | 2. Common Operands| All common operand types, except register operands, |
+ // | | defined by MachineOperand::MachineOperandType enum. |
+ // | 3. Physical | Register classes defined by the target, specialized |
+ // | Reg classes | by physical registers. |
+ // | 4. Virtual | Register classes defined by the target, specialized |
+ // | Reg classes | by virtual and physical registers. |
+ // +-------------------+-----------------------------------------------------+
+
+ /// Layout information for the MIR vocabulary. Defines the starting index
+ /// and size of each section in the vocabulary.
struct {
size_t OpcodeBase = 0;
- size_t OperandBase = 0;
+ size_t CommonOperandBase = 0;
+ size_t PhyRegBase = 0;
+ size_t VirtRegBase = 0;
size_t TotalEntries = 0;
} Layout;
- enum class Section : unsigned { Opcodes = 0, MaxSections };
+ enum class Section : unsigned {
+ Opcodes = 0,
+ CommonOperands = 1,
+ PhyRegisters = 2,
+ VirtRegisters = 3,
+ MaxSections
+ };
ir2vec::VocabStorage Storage;
- mutable std::set<std::string> UniqueBaseOpcodeNames;
+ std::set<std::string> UniqueBaseOpcodeNames;
+ SmallVector<std::string, 24> RegisterOperandNames;
+
+ // Some instructions have optional register operands that may be NoRegister.
+ // We return a zero vector in such cases.
+ Embedding ZeroEmbedding;
+
+ // We have specialized MO_Register handling in the Register operand section,
+ // so we don't include it here. Also, no MO_DbgInstrRef for now.
+ static constexpr StringLiteral CommonOperandNames[] = {
+ "Immediate", "CImmediate", "FPImmediate", "MBB",
+ "FrameIndex", "ConstantPoolIndex", "TargetIndex", "JumpTableIndex",
+ "ExternalSymbol", "GlobalAddress", "BlockAddress", "RegisterMask",
+ "RegisterLiveOut", "Metadata", "MCSymbol", "CFIIndex",
+ "IntrinsicID", "Predicate", "ShuffleMask"};
+ static_assert(std::size(CommonOperandNames) == MachineOperand::MO_Last - 1 &&
+ "Common operand names size changed, update accordingly");
+
const TargetInstrInfo &TII;
- void generateStorage(const VocabMap &OpcodeMap);
+ const TargetRegisterInfo &TRI;
+ const MachineRegisterInfo &MRI;
+
+ void generateStorage(const VocabMap &OpcodeMap,
+ const VocabMap &CommonOperandMap,
+ const VocabMap &PhyRegMap, const VocabMap &VirtRegMap);
void buildCanonicalOpcodeMapping();
+ void buildRegisterOperandMapping();
/// Get canonical index for a machine opcode
unsigned getCanonicalOpcodeIndex(unsigned Opcode) const;
+ /// Get index for a common (non-register) machine operand
+ unsigned
+ getCommonOperandIndex(MachineOperand::MachineOperandType OperandType) const;
+
+ /// Get index for a register machine operand
+ unsigned getRegisterOperandIndex(Register Reg) const;
+
+ // Accessors for operand types
+ const Embedding &
+ operator[](MachineOperand::MachineOperandType OperandType) const {
+ unsigned LocalIndex = getCommonOperandIndex(OperandType);
+ return Storage[static_cast<unsigned>(Section::CommonOperands)][LocalIndex];
+ }
+
+ const Embedding &operator[](Register Reg) const {
+ // Reg is sometimes NoRegister (0) for optional operands. We return a zero
+ // vector in this case.
+ if (!Reg.isValid())
+ return ZeroEmbedding;
+ // TODO: Implement proper stack slot handling for MIR2Vec embeddings.
+ // Stack slots represent frame indices and should have their own
+ // embedding strategy rather than defaulting to register class 0.
+ // Consider: 1) Separate vocabulary section for stack slots
+ // 2) Stack slot size/alignment based embeddings
+ // 3) Frame index based categorization
+ if (Reg.isStack())
+ return ZeroEmbedding;
+
+ unsigned LocalIndex = getRegisterOperandIndex(Reg);
+ auto SectionID =
+ Reg.isPhysical() ? Section::PhyRegisters : Section::VirtRegisters;
+ return Storage[static_cast<unsigned>(SectionID)][LocalIndex];
+ }
+
public:
/// Static method for extracting base opcode names (public for testing)
static std::string extractBaseOpcodeName(StringRef InstrName);
- /// Get canonical index for base name (public for testing)
+ /// Get indices from opcode or operand names. These are public for testing.
+ /// String based lookups are inefficient and should be avoided in general.
unsigned getCanonicalIndexForBaseName(StringRef BaseName) const;
+ unsigned getCanonicalIndexForOperandName(StringRef OperandName) const;
+ unsigned getCanonicalIndexForRegisterClass(StringRef RegName,
+ bool IsPhysical = true) const;
/// Get the string key for a vocabulary entry at the given position
std::string getStringKey(unsigned Pos) const;
@@ -111,6 +196,14 @@ public:
return Storage[static_cast<unsigned>(Section::Opcodes)][LocalIndex];
}
+ const Embedding &operator[](MachineOperand Operand) const {
+ auto OperandType = Operand.getType();
+ if (OperandType == MachineOperand::MO_Register)
+ return operator[](Operand.getReg());
+ else
+ return operator[](OperandType);
+ }
+
// Iterator access
using const_iterator = ir2vec::VocabStorage::const_iterator;
const_iterator begin() const { return Storage.begin(); }
@@ -120,18 +213,25 @@ public:
MIRVocabulary() = delete;
/// Factory method to create MIRVocabulary from vocabulary map
- static Expected<MIRVocabulary> create(VocabMap &&Entries,
- const TargetInstrInfo &TII);
+ static Expected<MIRVocabulary>
+ create(VocabMap &&OpcMap, VocabMap &&CommonOperandsMap, VocabMap &&PhyRegMap,
+ VocabMap &&VirtRegMap, const TargetInstrInfo &TII,
+ const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI);
/// Create a dummy vocabulary for testing purposes.
static Expected<MIRVocabulary>
- createDummyVocabForTest(const TargetInstrInfo &TII, unsigned Dim = 1);
+ createDummyVocabForTest(const TargetInstrInfo &TII,
+ const TargetRegisterInfo &TRI,
+ const MachineRegisterInfo &MRI, unsigned Dim = 1);
/// Total number of entries in the vocabulary
size_t getCanonicalSize() const { return Storage.size(); }
private:
- MIRVocabulary(VocabMap &&Entries, const TargetInstrInfo &TII);
+ MIRVocabulary(VocabMap &&OpcMap, VocabMap &&CommonOperandsMap,
+ VocabMap &&PhyRegMap, VocabMap &&VirtRegMap,
+ const TargetInstrInfo &TII, const TargetRegisterInfo &TRI,
+ const MachineRegisterInfo &MRI);
};
/// Base class for MIR embedders
@@ -144,11 +244,13 @@ protected:
const unsigned Dimension;
/// Weight for opcode embeddings
- const float OpcWeight;
+ const float OpcWeight, CommonOperandWeight, RegOperandWeight;
MIREmbedder(const MachineFunction &MF, const MIRVocabulary &Vocab)
: MF(MF), Vocab(Vocab), Dimension(Vocab.getDimension()),
- OpcWeight(mir2vec::OpcWeight) {}
+ OpcWeight(mir2vec::OpcWeight),
+ CommonOperandWeight(mir2vec::CommonOperandWeight),
+ RegOperandWeight(mir2vec::RegOperandWeight) {}
/// Function to compute embeddings.
Embedding computeEmbeddings() const;
@@ -208,11 +310,11 @@ public:
class MIR2VecVocabLegacyAnalysis : public ImmutablePass {
using VocabVector = std::vector<mir2vec::Embedding>;
using VocabMap = std::map<std::string, mir2vec::Embedding>;
- VocabMap StrVocabMap;
- VocabVector Vocab;
+ std::optional<mir2vec::MIRVocabulary> Vocab;
StringRef getPassName() const override;
- Error readVocabulary();
+ Error readVocabulary(VocabMap &OpcVocab, VocabMap &CommonOperandVocab,
+ VocabMap &PhyRegVocabMap, VocabMap &VirtRegVocabMap);
protected:
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -275,4 +377,4 @@ MachineFunctionPass *createMIR2VecPrinterLegacyPass(raw_ostream &OS);
} // namespace llvm
-#endif // LLVM_CODEGEN_MIR2VEC_H \ No newline at end of file
+#endif // LLVM_CODEGEN_MIR2VEC_H
diff --git a/llvm/include/llvm/CodeGen/MachineModuleSlotTracker.h b/llvm/include/llvm/CodeGen/MachineModuleSlotTracker.h
index 770f1b3..5504896 100644
--- a/llvm/include/llvm/CodeGen/MachineModuleSlotTracker.h
+++ b/llvm/include/llvm/CodeGen/MachineModuleSlotTracker.h
@@ -37,7 +37,7 @@ public:
MachineModuleSlotTracker(const MachineModuleInfo &MMI,
const MachineFunction *MF,
bool ShouldInitializeAllMetadata = true);
- ~MachineModuleSlotTracker();
+ ~MachineModuleSlotTracker() override;
void collectMachineMDNodes(MachineMDNodeListType &L) const;
};
diff --git a/llvm/include/llvm/CodeGen/MachineOutliner.h b/llvm/include/llvm/CodeGen/MachineOutliner.h
index fbb958cc..66cab3d 100644
--- a/llvm/include/llvm/CodeGen/MachineOutliner.h
+++ b/llvm/include/llvm/CodeGen/MachineOutliner.h
@@ -306,7 +306,7 @@ struct GlobalOutlinedFunction : public OutlinedFunction {
}
GlobalOutlinedFunction() = delete;
- ~GlobalOutlinedFunction() = default;
+ ~GlobalOutlinedFunction() override = default;
};
} // namespace outliner
diff --git a/llvm/include/llvm/CodeGen/ResourcePriorityQueue.h b/llvm/include/llvm/CodeGen/ResourcePriorityQueue.h
index c15bc67..0af4f47 100644
--- a/llvm/include/llvm/CodeGen/ResourcePriorityQueue.h
+++ b/llvm/include/llvm/CodeGen/ResourcePriorityQueue.h
@@ -75,7 +75,7 @@ namespace llvm {
public:
ResourcePriorityQueue(SelectionDAGISel *IS);
- ~ResourcePriorityQueue();
+ ~ResourcePriorityQueue() override;
bool isBottomUp() const override { return false; }
diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index 201dc68..0dcf400 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -559,6 +559,11 @@ m_VSelect(const T0_P &Cond, const T1_P &T, const T2_P &F) {
}
template <typename T0_P, typename T1_P, typename T2_P>
+inline auto m_SelectLike(const T0_P &Cond, const T1_P &T, const T2_P &F) {
+ return m_AnyOf(m_Select(Cond, T, F), m_VSelect(Cond, T, F));
+}
+
+template <typename T0_P, typename T1_P, typename T2_P>
inline Result_match<0, TernaryOpc_match<T0_P, T1_P, T2_P>>
m_Load(const T0_P &Ch, const T1_P &Ptr, const T2_P &Offset) {
return m_Result<0>(
diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
index 4eacbdc..26d7080 100644
--- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -18,7 +18,6 @@
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SparseMultiSet.h"
-#include "llvm/ADT/identity.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index 822245f..f031353 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -280,7 +280,7 @@ protected:
unsigned Mode = 0);
public:
- virtual ~TargetRegisterInfo();
+ ~TargetRegisterInfo() override;
/// Return the number of registers for the function. (may overestimate)
virtual unsigned getNumSupportedRegs(const MachineFunction &) const {
diff --git a/llvm/include/llvm/CodeGen/VLIWMachineScheduler.h b/llvm/include/llvm/CodeGen/VLIWMachineScheduler.h
index 112ff6d..65ff1eb 100644
--- a/llvm/include/llvm/CodeGen/VLIWMachineScheduler.h
+++ b/llvm/include/llvm/CodeGen/VLIWMachineScheduler.h
@@ -223,7 +223,7 @@ public:
enum { TopQID = 1, BotQID = 2, LogMaxQID = 2 };
ConvergingVLIWScheduler() : Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {}
- virtual ~ConvergingVLIWScheduler() = default;
+ ~ConvergingVLIWScheduler() override = default;
void initialize(ScheduleDAGMI *dag) override;
diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h
index 0062cec..98df06a 100644
--- a/llvm/include/llvm/IR/ModuleSummaryIndex.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h
@@ -1449,6 +1449,9 @@ private:
/// every summary of a GV is synchronized.
bool WithDSOLocalPropagation = false;
+ /// Indicates that summary-based internalization and promotion has run.
+ bool WithInternalizeAndPromote = false;
+
/// Indicates that we have whole program visibility.
bool WithWholeProgramVisibility = false;
@@ -1653,6 +1656,9 @@ public:
bool withDSOLocalPropagation() const { return WithDSOLocalPropagation; }
void setWithDSOLocalPropagation() { WithDSOLocalPropagation = true; }
+ bool withInternalizeAndPromote() const { return WithInternalizeAndPromote; }
+ void setWithInternalizeAndPromote() { WithInternalizeAndPromote = true; }
+
bool withWholeProgramVisibility() const { return WithWholeProgramVisibility; }
void setWithWholeProgramVisibility() { WithWholeProgramVisibility = true; }
diff --git a/llvm/include/llvm/IRReader/IRReader.h b/llvm/include/llvm/IRReader/IRReader.h
index 790140f..00cf12d 100644
--- a/llvm/include/llvm/IRReader/IRReader.h
+++ b/llvm/include/llvm/IRReader/IRReader.h
@@ -15,6 +15,7 @@
#define LLVM_IRREADER_IRREADER_H
#include "llvm/ADT/StringRef.h"
+#include "llvm/AsmParser/AsmParserContext.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Support/Compiler.h"
#include <memory>
@@ -50,19 +51,19 @@ getLazyIRFileModule(StringRef Filename, SMDiagnostic &Err, LLVMContext &Context,
/// for it. Otherwise, attempt to parse it as LLVM Assembly and return
/// a Module for it.
/// \param DataLayoutCallback Override datalayout in the llvm assembly.
-LLVM_ABI std::unique_ptr<Module> parseIR(MemoryBufferRef Buffer,
- SMDiagnostic &Err,
- LLVMContext &Context,
- ParserCallbacks Callbacks = {});
+LLVM_ABI std::unique_ptr<Module>
+parseIR(MemoryBufferRef Buffer, SMDiagnostic &Err, LLVMContext &Context,
+ ParserCallbacks Callbacks = {},
+ AsmParserContext *ParserContext = nullptr);
/// If the given file holds a bitcode image, return a Module for it.
/// Otherwise, attempt to parse it as LLVM Assembly and return a Module
/// for it.
/// \param DataLayoutCallback Override datalayout in the llvm assembly.
-LLVM_ABI std::unique_ptr<Module> parseIRFile(StringRef Filename,
- SMDiagnostic &Err,
- LLVMContext &Context,
- ParserCallbacks Callbacks = {});
+LLVM_ABI std::unique_ptr<Module>
+parseIRFile(StringRef Filename, SMDiagnostic &Err, LLVMContext &Context,
+ ParserCallbacks Callbacks = {},
+ AsmParserContext *ParserContext = nullptr);
}
#endif
diff --git a/llvm/include/llvm/Support/AllocToken.h b/llvm/include/llvm/Support/AllocToken.h
new file mode 100644
index 0000000..e40d816
--- /dev/null
+++ b/llvm/include/llvm/Support/AllocToken.h
@@ -0,0 +1,68 @@
+//===- llvm/Support/AllocToken.h - Allocation Token Calculation -----*- C++ -*//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Definition of AllocToken modes and shared calculation of stateless token IDs.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_ALLOCTOKEN_H
+#define LLVM_SUPPORT_ALLOCTOKEN_H
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include <cstdint>
+#include <optional>
+
+namespace llvm {
+
+/// Modes for generating allocation token IDs.
+enum class AllocTokenMode {
+ /// Incrementally increasing token ID.
+ Increment,
+
+ /// Simple mode that returns a statically-assigned random token ID.
+ Random,
+
+ /// Token ID based on allocated type hash.
+ TypeHash,
+
+ /// Token ID based on allocated type hash, where the top half ID-space is
+ /// reserved for types that contain pointers and the bottom half for types
+ /// that do not contain pointers.
+ TypeHashPointerSplit,
+};
+
+/// The default allocation token mode.
+inline constexpr AllocTokenMode DefaultAllocTokenMode =
+ AllocTokenMode::TypeHashPointerSplit;
+
+/// Returns the AllocTokenMode from its canonical string name; if an invalid
+/// name was provided returns nullopt.
+LLVM_ABI std::optional<AllocTokenMode>
+getAllocTokenModeFromString(StringRef Name);
+
+/// Metadata about an allocation used to generate a token ID.
+struct AllocTokenMetadata {
+ SmallString<64> TypeName;
+ bool ContainsPointer;
+};
+
+/// Calculates stable allocation token ID. Returns std::nullopt for stateful
+/// modes that are only available in the AllocToken pass.
+///
+/// \param Mode The token generation mode.
+/// \param Metadata The metadata about the allocation.
+/// \param MaxTokens The maximum number of tokens (must not be 0)
+/// \return The calculated allocation token ID, or std::nullopt.
+LLVM_ABI std::optional<uint64_t>
+getAllocToken(AllocTokenMode Mode, const AllocTokenMetadata &Metadata,
+ uint64_t MaxTokens);
+
+} // end namespace llvm
+
+#endif // LLVM_SUPPORT_ALLOCTOKEN_H
diff --git a/llvm/include/llvm/Transforms/Instrumentation/AllocToken.h b/llvm/include/llvm/Transforms/Instrumentation/AllocToken.h
index b1391cb0..077703c 100644
--- a/llvm/include/llvm/Transforms/Instrumentation/AllocToken.h
+++ b/llvm/include/llvm/Transforms/Instrumentation/AllocToken.h
@@ -16,6 +16,7 @@
#include "llvm/IR/Analysis.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/Support/AllocToken.h"
#include <optional>
namespace llvm {
@@ -23,6 +24,7 @@ namespace llvm {
class Module;
struct AllocTokenOptions {
+ AllocTokenMode Mode = DefaultAllocTokenMode;
std::optional<uint64_t> MaxTokens;
bool FastABI = false;
bool Extended = false;
diff --git a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h
index 979f3b3e..e677cbf 100644
--- a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h
@@ -21,6 +21,7 @@
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Printable.h"
#include <cassert>
namespace llvm {
@@ -611,6 +612,10 @@ LLVM_ABI void InvertBranch(BranchInst *PBI, IRBuilderBase &Builder);
// br/brcond/unreachable/ret
LLVM_ABI bool hasOnlySimpleTerminator(const Function &F);
+/// Print BasicBlock \p BB as an operand or print "<nullptr>" if \p BB is a
+/// nullptr.
+LLVM_ABI Printable printBasicBlock(const BasicBlock *BB);
+
} // end namespace llvm
#endif // LLVM_TRANSFORMS_UTILS_BASICBLOCKUTILS_H
diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp
index f9bf092..6f19a68 100644
--- a/llvm/lib/Analysis/DXILResource.cpp
+++ b/llvm/lib/Analysis/DXILResource.cpp
@@ -255,6 +255,12 @@ static void formatTypeName(SmallString<64> &Dest, StringRef Name,
if (!ContainedType)
return;
+ SmallVector<uint64_t> ArrayDimensions;
+ while (ArrayType *AT = dyn_cast<ArrayType>(ContainedType)) {
+ ArrayDimensions.push_back(AT->getNumElements());
+ ContainedType = AT->getElementType();
+ }
+
StringRef ElementName;
ElementType ET = toDXILElementType(ContainedType, IsSigned);
if (ET != ElementType::Invalid) {
@@ -271,6 +277,8 @@ static void formatTypeName(SmallString<64> &Dest, StringRef Name,
DestStream << "<" << ElementName;
if (const FixedVectorType *VTy = dyn_cast<FixedVectorType>(ContainedType))
DestStream << VTy->getNumElements();
+ for (uint64_t Dim : ArrayDimensions)
+ DestStream << "[" << Dim << "]";
DestStream << ">";
}
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index b573023..8da51d0 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -4866,89 +4866,6 @@ static Value *simplifySelectWithFCmp(Value *Cond, Value *T, Value *F,
return nullptr;
}
-/// Look for the following pattern and simplify %to_fold to %identicalPhi.
-/// Here %phi, %to_fold and %phi.next perform the same functionality as
-/// %identicalPhi and hence the select instruction %to_fold can be folded
-/// into %identicalPhi.
-///
-/// BB1:
-/// %identicalPhi = phi [ X, %BB0 ], [ %identicalPhi.next, %BB1 ]
-/// %phi = phi [ X, %BB0 ], [ %phi.next, %BB1 ]
-/// ...
-/// %identicalPhi.next = select %cmp, %val, %identicalPhi
-/// (or select %cmp, %identicalPhi, %val)
-/// %to_fold = select %cmp2, %identicalPhi, %phi
-/// %phi.next = select %cmp, %val, %to_fold
-/// (or select %cmp, %to_fold, %val)
-///
-/// Prove that %phi and %identicalPhi are the same by induction:
-///
-/// Base case: Both %phi and %identicalPhi are equal on entry to the loop.
-/// Inductive case:
-/// Suppose %phi and %identicalPhi are equal at iteration i.
-/// We look at their values at iteration i+1 which are %phi.next and
-/// %identicalPhi.next. They would have become different only when %cmp is
-/// false and the corresponding values %to_fold and %identicalPhi differ
-/// (similar reason for the other "or" case in the bracket).
-///
-/// The only condition when %to_fold and %identicalPh could differ is when %cmp2
-/// is false and %to_fold is %phi, which contradicts our inductive hypothesis
-/// that %phi and %identicalPhi are equal. Thus %phi and %identicalPhi are
-/// always equal at iteration i+1.
-bool isSimplifierIdenticalPHI(PHINode &PN, PHINode &IdenticalPN) {
- if (PN.getParent() != IdenticalPN.getParent())
- return false;
- if (PN.getNumIncomingValues() != 2)
- return false;
-
- // Check that only the backedge incoming value is different.
- unsigned DiffVals = 0;
- BasicBlock *DiffValBB = nullptr;
- for (unsigned i = 0; i < 2; i++) {
- BasicBlock *PredBB = PN.getIncomingBlock(i);
- if (PN.getIncomingValueForBlock(PredBB) !=
- IdenticalPN.getIncomingValueForBlock(PredBB)) {
- DiffVals++;
- DiffValBB = PredBB;
- }
- }
- if (DiffVals != 1)
- return false;
- // Now check that the backedge incoming values are two select
- // instructions with the same condition. Either their true
- // values are the same, or their false values are the same.
- auto *SI = dyn_cast<SelectInst>(PN.getIncomingValueForBlock(DiffValBB));
- auto *IdenticalSI =
- dyn_cast<SelectInst>(IdenticalPN.getIncomingValueForBlock(DiffValBB));
- if (!SI || !IdenticalSI)
- return false;
- if (SI->getCondition() != IdenticalSI->getCondition())
- return false;
-
- SelectInst *SIOtherVal = nullptr;
- Value *IdenticalSIOtherVal = nullptr;
- if (SI->getTrueValue() == IdenticalSI->getTrueValue()) {
- SIOtherVal = dyn_cast<SelectInst>(SI->getFalseValue());
- IdenticalSIOtherVal = IdenticalSI->getFalseValue();
- } else if (SI->getFalseValue() == IdenticalSI->getFalseValue()) {
- SIOtherVal = dyn_cast<SelectInst>(SI->getTrueValue());
- IdenticalSIOtherVal = IdenticalSI->getTrueValue();
- } else {
- return false;
- }
-
- // Now check that the other values in select, i.e., %to_fold and
- // %identicalPhi, are essentially the same value.
- if (!SIOtherVal || IdenticalSIOtherVal != &IdenticalPN)
- return false;
- if (!(SIOtherVal->getTrueValue() == &IdenticalPN &&
- SIOtherVal->getFalseValue() == &PN) &&
- !(SIOtherVal->getTrueValue() == &PN &&
- SIOtherVal->getFalseValue() == &IdenticalPN))
- return false;
- return true;
-}
-
/// Given operands for a SelectInst, see if we can fold the result.
/// If not, this returns null.
static Value *simplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal,
@@ -5124,14 +5041,7 @@ static Value *simplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal,
std::optional<bool> Imp = isImpliedByDomCondition(Cond, Q.CxtI, Q.DL);
if (Imp)
return *Imp ? TrueVal : FalseVal;
- // Look for same PHIs in the true and false values.
- if (auto *TruePHI = dyn_cast<PHINode>(TrueVal))
- if (auto *FalsePHI = dyn_cast<PHINode>(FalseVal)) {
- if (isSimplifierIdenticalPHI(*TruePHI, *FalsePHI))
- return FalseVal;
- if (isSimplifierIdenticalPHI(*FalsePHI, *TruePHI))
- return TrueVal;
- }
+
return nullptr;
}
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 6f7dd79..7597f3a 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -3768,13 +3768,11 @@ ScalarEvolution::getAddRecExpr(SmallVectorImpl<const SCEV *> &Operands,
return getOrCreateAddRecExpr(Operands, L, Flags);
}
-const SCEV *
-ScalarEvolution::getGEPExpr(GEPOperator *GEP,
- const SmallVectorImpl<const SCEV *> &IndexExprs) {
+const SCEV *ScalarEvolution::getGEPExpr(GEPOperator *GEP,
+ ArrayRef<const SCEV *> IndexExprs) {
const SCEV *BaseExpr = getSCEV(GEP->getPointerOperand());
// getSCEV(Base)->getType() has the same address space as Base->getType()
// because SCEV::getType() preserves the address space.
- Type *IntIdxTy = getEffectiveSCEVType(BaseExpr->getType());
GEPNoWrapFlags NW = GEP->getNoWrapFlags();
if (NW != GEPNoWrapFlags::none()) {
// We'd like to propagate flags from the IR to the corresponding SCEV nodes,
@@ -3787,13 +3785,20 @@ ScalarEvolution::getGEPExpr(GEPOperator *GEP,
NW = GEPNoWrapFlags::none();
}
+ return getGEPExpr(BaseExpr, IndexExprs, GEP->getSourceElementType(), NW);
+}
+
+const SCEV *ScalarEvolution::getGEPExpr(const SCEV *BaseExpr,
+ ArrayRef<const SCEV *> IndexExprs,
+ Type *SrcElementTy, GEPNoWrapFlags NW) {
SCEV::NoWrapFlags OffsetWrap = SCEV::FlagAnyWrap;
if (NW.hasNoUnsignedSignedWrap())
OffsetWrap = setFlags(OffsetWrap, SCEV::FlagNSW);
if (NW.hasNoUnsignedWrap())
OffsetWrap = setFlags(OffsetWrap, SCEV::FlagNUW);
- Type *CurTy = GEP->getType();
+ Type *CurTy = BaseExpr->getType();
+ Type *IntIdxTy = getEffectiveSCEVType(BaseExpr->getType());
bool FirstIter = true;
SmallVector<const SCEV *, 4> Offsets;
for (const SCEV *IndexExpr : IndexExprs) {
@@ -3812,7 +3817,7 @@ ScalarEvolution::getGEPExpr(GEPOperator *GEP,
if (FirstIter) {
assert(isa<PointerType>(CurTy) &&
"The first index of a GEP indexes a pointer");
- CurTy = GEP->getSourceElementType();
+ CurTy = SrcElementTy;
FirstIter = false;
} else {
CurTy = GetElementPtrInst::getTypeAtIndex(CurTy, (uint64_t)0);
diff --git a/llvm/lib/Analysis/models/x86SeedEmbeddingVocab100D.json b/llvm/lib/Analysis/models/x86SeedEmbeddingVocab100D.json
index 0afe5c7..f026b0d 100644
--- a/llvm/lib/Analysis/models/x86SeedEmbeddingVocab100D.json
+++ b/llvm/lib/Analysis/models/x86SeedEmbeddingVocab100D.json
@@ -1,5 +1,5 @@
{
- "entities" : {
+ "Opcodes" : {
"ABS_Fp":[0.07323841750621796, -0.006006906274706125, 0.09751169383525848, -0.011089739389717579, 0.06642112135887146, -0.015824640169739723, -0.021592319011688232, -0.0035401300992816687, 0.06047678738832474, -0.007392085622996092, 0.07134906202554703, -0.019624482840299606, -0.10975595563650131, -0.007685789838433266, 0.07451746612787247, 0.06384266912937164, -0.08230067789554596, 0.050922468304634094, 0.013724055141210556, 0.015687907114624977, -0.018451329320669174, 0.046987198293209076, -0.037734340876340866, -0.07235030829906464, 0.10218106210231781, 0.08037368208169937, -0.029537858441472054, -0.047520823776721954, -0.022125739604234695, -0.03125226870179176, -0.02882847562432289, 0.013811410404741764, 0.0023568253964185715, 0.017958490177989006, -0.05359291657805443, -0.03606243059039116, 0.07840022444725037, -0.016711654141545296, -0.038644544780254364, 0.05886651948094368, -0.011418955400586128, -0.04882095381617546, 0.04027162492275238, 0.001088760793209076, 0.03045983798801899, -0.10998888313770294, -0.0097441291436553, 0.015445191413164139, 0.030951637774705887, -0.06309321522712708, -0.019475746899843216, -0.029662512242794037, 0.05312168970704079, 0.05355998873710632, 0.05060160160064697, -0.053278811275959015, -0.01803833432495594, 0.010853713378310204, -0.053911495953798294, 0.06630647927522659, -0.08671313524246216, 0.0699775293469429, -0.08346731215715408, -0.045348167419433594, 0.06779918074607849, 0.008865933865308762, 0.05460203066468239, 0.007126103155314922, 0.0012282058596611023, 0.06817980855703354, 0.0216530654579401, 0.03552381321787834, 0.015414077788591385, -0.06002715229988098, 0.05233345925807953, 0.0782286673784256, 0.04220856353640556, -0.005762201733887196, 0.004772072657942772, 0.004578332882374525, 0.002619141712784767, 0.024511393159627914, -0.10089710354804993, 0.018322769552469254, 0.020811809226870537, -0.03358744457364082, -0.06896928697824478, -0.007399350870400667, -0.044467780739068985, -0.08094192296266556, -0.09795571863651276, 0.08391229063272476, -0.04749457910656929, 0.0029586481396108866, -5.354872337193228e-05, 0.005788655485957861, 0.015252145007252693, 0.06928747892379761, 0.041780371218919754, 0.016391364857554436],
"ADC":[-0.07533542811870575, -0.01729339174926281, 0.04298720881342888, 0.015697332099080086, -0.04403507336974144, -0.059322185814380646, -0.050977922976017, 0.027526788413524628, -0.07009710371494293, -0.025621667504310608, 0.0352291613817215, -0.011538374237716198, 0.03682859241962433, -0.09788215160369873, -0.07216927409172058, -0.03659192472696304, 0.05676230415701866, -0.06369645893573761, -0.04756825789809227, 0.005865555722266436, 0.022270306944847107, -0.042112063616514206, 0.07008901983499527, 0.07748222351074219, -0.1020870953798294, -0.008511601015925407, -0.05725255608558655, -0.07881367206573486, 0.05627593398094177, -0.0005361076910048723, 0.03351512551307678, 0.04348289221525192, -0.08322969079017639, -0.02161242999136448, -0.07805898040533066, 0.04819482937455177, -0.061123576015233994, -0.010114834643900394, -0.04676959663629532, -0.008176938630640507, 0.010575453750789165, -0.04312445595860481, 0.00376943894661963, -0.0691257119178772, 0.03553615137934685, 0.10397598147392273, 0.009375158697366714, 0.001147320494055748, 0.026351911947131157, -0.0194610096514225, -0.05202522128820419, 0.014047946780920029, -0.040036872029304504, 0.06963572651147842, 0.04827437922358513, -0.06908547878265381, 0.024857567623257637, -0.03304143249988556, 0.02291242778301239, 0.07687342166900635, -0.05110599845647812, -0.00873416755348444, 0.026205750182271004, 0.045064594596624374, -0.03565925359725952, 0.09580051153898239, -0.02518773265182972, 0.047807395458221436, -0.03548192232847214, 0.08286304026842117, -0.053511787205934525, 0.02892065793275833, -0.0495525486767292, 0.02590095065534115, -0.006982128601521254, 0.006042638327926397, -0.07269058376550674, 0.02401554025709629, -0.05660006031394005, -0.026029467582702637, 0.05318204686045647, 0.06714116781949997, -0.0023821850772947073, 0.05028798058629036, -0.005811943672597408, -0.003296421840786934, -0.005409242119640112, -0.10150349885225296, -0.06406981498003006, 0.02553202211856842, -0.002790689468383789, 0.0663856491446495, 0.09109167754650116, -0.04678672179579735, 0.022019781172275543, 0.007821275852620602, 0.022490357980132103, -0.058503177016973495, 0.08841150254011154, -0.00892670825123787],
"ADD":[-0.037626221776008606, 0.006784931290894747, 0.10051396489143372, -0.0014993306249380112, -0.0323498398065567, -0.03148593008518219, -0.014100957661867142, -0.020252650603652, 0.014126972295343876, -0.1295478343963623, 0.08520576357841492, -0.02513248659670353, 0.03539956361055374, -0.07019674777984619, -0.019069846719503403, 0.016678515821695328, -0.009174983017146587, -0.019034702330827713, -0.024083402007818222, -0.07829779386520386, -0.007908892817795277, -0.07924024760723114, -0.034599609673023224, 0.05271153524518013, 0.0016642026603221893, -0.03938138112425804, 0.0019624519627541304, 0.03562740981578827, 0.07340876758098602, 0.09457183629274368, -0.06507840752601624, 0.00246993126347661, -0.004548616707324982, 0.058226197957992554, -0.021043049171566963, -0.0599520243704319, -0.03138553351163864, 0.03265950828790665, 0.004963710438460112, -0.003248866181820631, -0.04021746292710304, 0.038208190351724625, -0.02256007120013237, 0.10770396143198013, 0.013757425360381603, 0.040707558393478394, -0.00694271270185709, -0.012331271544098854, 0.004992029629647732, -0.032236646860837936, 0.01055158581584692, 0.04604483023285866, 0.09973260760307312, 0.07322807610034943, 0.06853726506233215, 0.004230210557579994, -0.04007832333445549, 0.16341225802898407, -0.01683313027024269, -0.01998194307088852, -0.035919081419706345, -0.055582448840141296, 0.008072910830378532, -0.0054771858267486095, -0.013343624770641327, 0.014230597764253616, -0.06542462855577469, 0.015897123143076897, -0.06011642515659332, 0.07983837276697159, 0.026512078940868378, 0.014883842319250107, -0.015171286650002003, 4.1508101276122034e-05, -0.048078570514917374, -0.052594274282455444, -0.07897629588842392, -0.01334046758711338, -0.06180298700928688, 0.022423526272177696, 0.07393807917833328, 0.022332284599542618, 0.04279463365674019, 0.04075624793767929, 0.007524204906076193, -0.024405587464571, 0.0011822516098618507, -0.0019135301699861884, 0.10789427906274796, -0.040499038994312286, 0.011574117466807365, 0.048836030066013336, 0.0380941741168499, -0.047072283923625946, -0.01285380870103836, -0.038019485771656036, -0.06277137994766235, -0.0034404860343784094, -0.031123748049139977, 0.04279843345284462],
@@ -47,7 +47,6 @@
"CVTSS":[-0.06638028472661972, -0.011326023377478123, 0.008208844810724258, 0.007368308026343584, 0.009791173972189426, -0.03396046161651611, 0.02250068075954914, -0.057750262320041656, -0.04949551820755005, 0.02559898979961872, -0.025012727826833725, -0.05923935025930405, 0.005058884620666504, 0.008716589771211147, -0.017511164769530296, -0.07095059007406235, -0.06573225557804108, -0.028140492737293243, 0.11092227697372437, 0.02664722129702568, -0.01997300609946251, 0.0798712745308876, -0.022800235077738762, 0.09157945215702057, 0.025709187611937523, -0.09037603437900543, -0.07092109322547913, -0.04094154015183449, -0.025702493265271187, 0.015247789211571217, 0.06089004501700401, 0.051023274660110474, -0.04670926183462143, 0.04763137549161911, -0.035940639674663544, 0.002320673782378435, -0.005764417815953493, -0.07975194603204727, -0.0038822791539132595, 0.06728507578372955, 0.020742014050483704, 0.08809743821620941, -0.061493389308452606, -0.0485445000231266, -0.022268671542406082, 0.08475345373153687, -0.0030403153505176306, -0.05737586319446564, -0.07930854707956314, -0.01657176949083805, 0.04658877104520798, 0.005716703832149506, -0.04288295656442642, -0.08686209470033646, -0.07359853386878967, 0.02947128191590309, -0.03684910386800766, -0.03841136023402214, 0.01288131158798933, -0.04918907582759857, -0.05579863488674164, 0.06267702579498291, -0.0034505922812968493, 0.034628838300704956, 0.04280426353216171, 0.042202845215797424, 0.012274117209017277, 0.025021208450198174, -0.07867497205734253, 0.03826712444424629, 0.017088277265429497, 0.037250861525535583, -0.016143174842000008, -0.06754780560731888, -0.013957766816020012, 0.1060054823756218, 0.014829001389443874, 0.06808885931968689, 0.022929415106773376, -0.10870063304901123, -0.002258410444483161, 0.009293666109442711, 0.08529872447252274, -0.018672339618206024, -0.06721168756484985, 0.04180533438920975, -0.0031767592299729586, -0.023869113996624947, -0.00011912015179404989, -0.034519728273153305, 0.0022619885858148336, -0.00573525857180357, -0.033912476152181625, 0.059763263911008835, -0.048703599721193314, -0.07433722168207169, 0.04105979949235916, 0.0022583131212741137, 0.03093089908361435, -0.05187990516424179],
"CVTTSD":[-0.08537309616804123, 0.0010597433429211378, 0.07481679320335388, 0.05997887998819351, -0.0376993790268898, 0.10309506952762604, 0.07795511186122894, 0.0833413377404213, 0.056095756590366364, 0.05851535126566887, -0.057075001299381256, 0.020756129175424576, -0.08901876956224442, 0.02559811621904373, -0.016971183940768242, -0.04282280057668686, -0.005386374890804291, -0.06672719866037369, -0.09664622694253922, 0.06042492762207985, -0.042353514581918716, 0.06194235011935234, -0.025712836533784866, -0.029526079073548317, 0.044016264379024506, 0.036507125943899155, -0.038406822830438614, 0.006118632387369871, -0.0495009683072567, -0.07487531006336212, -0.07304015755653381, 0.042621925473213196, -0.06314127147197723, 0.03934277594089508, -0.09373295307159424, -0.05887934938073158, 0.010626542381942272, -0.050934500992298126, -0.037448156625032425, 0.01178495679050684, -0.07045318186283112, 0.10210251808166504, -0.07279546558856964, 0.04947654530405998, -0.039519909769296646, 0.07030976563692093, -0.011039734818041325, 0.01187387015670538, -0.0840335488319397, -0.005615191534161568, -0.06869980692863464, -0.012282256036996841, -0.013054385781288147, -0.0711965560913086, 0.015505223535001278, 0.0693473145365715, 0.012862266041338444, -0.04747828096151352, 0.023439936339855194, 0.03891129791736603, -0.04998844489455223, -0.04673001170158386, 0.02121424488723278, 0.0501207634806633, 0.07420068979263306, -0.014888633042573929, 0.007586659397929907, 0.01340668834745884, -0.09216003119945526, 0.09335170686244965, 0.023272672668099403, 0.030810026451945305, 0.05792044475674629, -0.020374637097120285, -0.02717672660946846, 0.028085753321647644, 0.08691198378801346, 0.061656054109334946, -0.07689087092876434, 0.0407567173242569, 0.010403914377093315, -0.03389676660299301, 0.07075867801904678, 0.002534526167437434, -0.026066122576594353, 0.005012217443436384, 0.08335569500923157, -0.02732011303305626, 0.03854125738143921, 0.03336648270487785, -0.10646265000104904, -0.003997548017650843, 0.09871185570955276, 0.0275016650557518, 0.015653448179364204, 0.07066125422716141, 0.05811227858066559, 0.046357106417417526, 0.047027964144945145, 0.07407277077436447],
"CVTTSS":[-0.07762601226568222, 0.051891617476940155, 0.02840222790837288, 0.012996217235922813, -0.04709569737315178, -0.011790127493441105, 0.07787185907363892, 0.07411551475524902, 0.04010153189301491, 0.000911108567379415, -0.09610971063375473, 0.042953960597515106, 0.01613607630133629, -0.07504888623952866, -0.04967263713479042, 0.06148393824696541, -0.018901845440268517, 0.08033818751573563, -0.06893469393253326, -0.036083199083805084, 0.08206851035356522, 0.08462843298912048, 0.06728347390890121, -0.03210798278450966, -0.019102206453680992, 0.0723310112953186, 0.009836986660957336, -0.057902153581380844, 0.007954364642500877, -0.015247606672346592, 0.08317636698484421, -0.030078981071710587, -0.003329804167151451, -0.00047014118172228336, -0.02859017252922058, -0.07635723054409027, -0.008230162784457207, 0.03107159025967121, -0.009525406174361706, 0.06515175849199295, -0.06525594741106033, -0.028639627620577812, -0.0781184732913971, 0.009911812841892242, 0.011008340865373611, -0.04294031485915184, -0.04256690293550491, -1.394751961925067e-05, -0.029347950592637062, -0.031849224120378494, 0.012988862581551075, -0.0009693846222944558, -0.019299298524856567, 0.0045416890643537045, -0.04690401256084442, -0.04800841212272644, 0.0020325451623648405, -0.02004505693912506, 0.04130777344107628, -0.033602941781282425, -0.06956057250499725, -0.008079515770077705, 0.0033002288546413183, 0.03853915259242058, 0.08760882169008255, -0.04805464297533035, 0.02319355681538582, 0.018974801525473595, -0.08521144837141037, -0.05224936082959175, -0.023577861487865448, 0.01627342589199543, 0.024244949221611023, 0.09439363330602646, -0.007235093507915735, 0.055853620171546936, -0.00885567907243967, 0.02217228338122368, 0.05414341762661934, -0.0278383269906044, -0.000764147553127259, 0.045272815972566605, -0.009049531072378159, 0.05590446665883064, -0.05074811726808548, -0.06311893463134766, -0.026139337569475174, 0.01067473366856575, -0.043730076402425766, -0.07134802639484406, -0.11087869852781296, 0.05522335693240166, -0.07894640415906906, -0.06710508465766907, -0.022497203201055527, 0.0777427926659584, -0.07944057136774063, 0.05494234338402748, -0.04788406938314438, -0.032921578735113144],
- "ConstantPoolIndex":[0.041396364569664, -0.032536957412958145, -0.01450332161039114, -0.006678386591374874, 0.058945223689079285, 0.02544882893562317, -0.03047209233045578, -0.07739393413066864, -0.09328317642211914, -0.01668739691376686, -0.024649402126669884, -0.0379607230424881, -0.11910244077444077, -0.020992999896407127, -0.007654233835637569, -0.005232746247202158, -0.05641235038638115, -0.030478237196803093, -0.11095637828111649, -0.029757868498563766, 0.007831704802811146, -0.06478779017925262, -0.029330771416425705, -0.016729608178138733, 0.016851121559739113, -0.08636923134326935, 0.09819734841585159, -0.06862954050302505, -0.054081980139017105, -0.11573795974254608, 0.025045182555913925, -0.045820001512765884, -0.03937136381864548, -0.0006095073185861111, 0.010480350814759731, 0.04263518005609512, -0.07309181243181229, 0.030367357656359673, 0.05174611508846283, -0.07616177201271057, 0.08458246290683746, -0.05704038590192795, -0.08539492636919022, -0.027642514556646347, -0.01617196388542652, 0.025178344920277596, 0.009598441421985626, -0.02391812391579151, -0.007018273696303368, 0.08220435678958893, 0.019317878410220146, -0.07800780981779099, 0.008812256157398224, -0.08796992152929306, -0.018406951799988747, 0.06285018473863602, 0.0247958917170763, -0.010797450318932533, 0.042904313653707504, 0.04307369515299797, 0.03591239079833031, 0.0318138487637043, -0.052741825580596924, -0.05960077419877052, 0.05289359390735626, -0.07335714250802994, -0.07966916263103485, 0.06509458273649216, -0.014078558422625065, 0.05966315418481827, -0.10191051661968231, 0.038503143936395645, 0.08414285629987717, -0.09167703986167908, -0.03125883638858795, 0.00029595239902846515, -0.05052953213453293, 0.06109768897294998, 0.027757229283452034, 0.07064288854598999, 0.025423981249332428, 0.04430470988154411, 0.006646708585321903, 0.011614424176514149, -0.058028463274240494, -0.026873555034399033, -0.045714568346738815, -0.009242760017514229, -0.08255617320537567, 0.03060135245323181, -0.019932182505726814, -0.07189206779003143, 0.01935136877000332, 0.05297813192009926, 0.004497232846915722, -0.08383949100971222, -0.0008196682319976389, 0.03524069860577583, 0.023135961964726448, 0.00863903108984232],
"DEC":[0.0634445771574974, -0.06605149805545807, 0.03212125599384308, 0.030006375163793564, -0.08837386220693588, -0.016591178253293037, -0.03157195448875427, 0.005282422062009573, 0.04301748052239418, -0.035375431180000305, -0.050481121987104416, -0.10733921080827713, -0.03802337497472763, -0.0745977833867073, -0.03943190351128578, -0.014895747415721416, 0.004689200781285763, -0.05872263386845589, -0.02043316885828972, 0.017881838604807854, -0.02151746302843094, 0.049130357801914215, -0.0980888232588768, -0.0012140831677243114, -0.03892286866903305, -0.050167523324489594, -0.06817777454853058, 0.011282221414148808, 0.0848090872168541, -0.04859968274831772, -0.005405630450695753, 0.09327276051044464, -0.031913015991449356, -0.07784294337034225, -0.039762917906045914, -0.0004000961489509791, -0.03763844072818756, -0.024915525689721107, 0.04509890824556351, 0.05546657368540764, -0.055939678102731705, -0.0467451736330986, -0.030023904517292976, -0.010519847273826599, 0.009574057534337044, 0.023444844409823418, 0.007250144146382809, 0.060414351522922516, -0.0011268716771155596, -0.10112253576517105, -0.068567655980587, -0.044332459568977356, 0.0022569731809198856, -0.012019195593893528, 0.0016708170296624303, 0.01029527559876442, -0.024694599211215973, -0.0428738109767437, 0.053816765546798706, 0.09999147802591324, 0.06608963757753372, 0.014324366115033627, 0.022997796535491943, -0.012565241195261478, -0.008212191984057426, -0.012308428063988686, -0.09830988198518753, -0.04177428036928177, 0.03759279474616051, 0.06749766319990158, -0.08330990374088287, -0.06375840306282043, 0.0471678152680397, 0.06524914503097534, 0.09668447077274323, 0.07395336031913757, -0.06081546097993851, 0.0322561152279377, -0.05461571738123894, 0.022349894046783447, 0.0981096625328064, 0.019211066886782646, 0.10566835105419159, 0.004508140496909618, 0.030159158632159233, 0.1076640635728836, -0.004145997576415539, 0.08043811470270157, 0.030684711411595345, 0.07909402251243591, -0.015952520072460175, 0.027102122083306313, 0.017120881006121635, 0.0860346332192421, 0.06145261228084564, -0.01827210932970047, 0.027506740763783455, 0.08201386034488678, -0.09402544051408768, -0.07927247136831284],
"DIV":[0.08121486008167267, -0.06398852169513702, -0.007856910116970539, 0.09644383192062378, 0.0013691268395632505, 0.03523438796401024, -0.04342259466648102, -0.011761687695980072, 0.021194210276007652, -0.0386938601732254, -0.004948849324136972, -0.08348845690488815, 0.005121953319758177, -0.06682730466127396, -0.004115825518965721, 0.015023703686892986, 0.042783256620168686, 0.08872916549444199, -0.03392689675092697, -0.014770613051950932, 0.001988545060157776, -0.05145770683884621, -0.029310323297977448, 0.06324473023414612, -0.08066411316394806, 0.006997138261795044, 0.004352204035967588, -0.060964930802583694, 0.02948148362338543, 0.052747759968042374, -0.05635778605937958, -0.014655586332082748, 0.015838103368878365, -0.04539657384157181, 0.031915292143821716, 0.05234432592988014, -0.012030252255499363, 0.06431112438440323, -0.027869969606399536, -0.006431832443922758, 0.025956276804208755, 0.047651831060647964, -0.01758543774485588, 0.07249220460653305, -0.049627624452114105, -0.007435495033860207, 0.0015833197394385934, 2.190603845519945e-05, 0.03457536920905113, 0.03895196691155434, -0.037442032247781754, 0.003120564157143235, -0.0690622553229332, -0.04405339062213898, 0.016464274376630783, -0.05068953335285187, 0.009520933963358402, 0.05033525079488754, 0.030095860362052917, 0.08773164451122284, -0.03623930364847183, -0.0076989103108644485, 0.0133424773812294, 0.025229837745428085, 0.018198521807789803, 0.011319941841065884, -0.005582685582339764, -0.03598775342106819, -0.0565820187330246, 0.08609189838171005, 0.035601116716861725, -0.007436969317495823, -0.018040914088487625, -0.04825054481625557, -0.014956142753362656, 0.03343576192855835, -0.0739198625087738, 0.038971979171037674, -0.03691745549440384, -0.0371851809322834, 0.08137080073356628, 0.03924981504678726, -0.06499960273504257, 0.047913506627082825, -0.0464070662856102, 0.04404731094837189, -0.03972303494811058, 0.03341617435216904, 0.05367732420563698, -0.04457789286971092, -0.07455608248710632, 0.007865827530622482, 0.04562194645404816, -0.03552774339914322, -0.007738951593637466, 0.09388759732246399, -0.015701837837696075, 0.033921483904123306, -0.017276542261242867, 0.04943705350160599],
"DIVPDrm":[0.04179735854268074, 0.008989601396024227, 0.0027430830523371696, 0.06804384291172028, -0.06657993793487549, 0.033647675067186356, -0.03707171231508255, 0.08443991839885712, -0.054565757513046265, 0.0765392854809761, -0.08189049363136292, 0.02573087066411972, 0.018917549401521683, 0.079402856528759, -0.011117411777377129, 0.06308865547180176, -0.045432765036821365, -0.05054701492190361, -0.009618235751986504, -0.0594516322016716, 0.07967120409011841, 0.08030137419700623, -0.0768255814909935, -0.061036787927150726, 0.004279104992747307, -0.09737113863229752, 0.07295801490545273, -0.027599459514021873, 0.0045133912935853004, -0.048141367733478546, 0.0003157609316986054, -0.014835191890597343, 0.01462356187403202, -0.03225003555417061, 0.06723359227180481, 0.05244021862745285, 0.07099424302577972, -0.09206876158714294, 0.06154841184616089, -0.022400988265872, 0.034042902290821075, 0.002528816694393754, -0.04578591138124466, -0.023195132613182068, -0.07696253061294556, -0.03475971147418022, 0.03545870631933212, -0.021839862689375877, 0.0036371496971696615, 0.07372148334980011, -0.0596211701631546, -0.06768393516540527, -0.032637521624565125, 0.008432515896856785, 0.007569535635411739, -0.0034237385261803865, 0.05811845883727074, 0.013580343686044216, -0.03924565017223358, -0.025963587686419487, 0.03800642117857933, -0.04651957005262375, -0.033428385853767395, -0.053251899778842926, -0.04647624120116234, -0.034290049225091934, 0.003906013211235404, -0.05534028634428978, 0.04434804245829582, -0.08216925710439682, -0.011801591143012047, -0.006801240611821413, -0.07483590394258499, -0.06332433968782425, -0.005107037723064423, -0.008274846710264683, -0.07277056574821472, 0.03865613043308258, -0.0472225658595562, 0.009775533340871334, 0.055412523448467255, -0.014846398495137691, -0.008565607480704784, -0.018367087468504906, 0.038180120289325714, 0.06085506081581116, -0.02658388763666153, 0.006586031056940556, 0.0761575847864151, -0.007659312803298235, -0.10445686429738998, 0.01846102997660637, 0.02885548584163189, 0.0437043160200119, -0.012576445005834103, 0.04055696353316307, 0.002144219819456339, -0.08052077144384384, 0.03422001749277115, 0.03888843208551407],
@@ -62,12 +61,9 @@
"DIV_Fp":[0.0013771128142252564, -0.03939857333898544, 0.06826473772525787, -0.055852942168712616, 0.021110225468873978, -0.07429434359073639, -0.01439732313156128, 0.047745198011398315, 0.03544871136546135, -0.006474921014159918, -0.05228240415453911, 0.00804696511477232, 0.0025021089240908623, 0.049810487776994705, -0.009595588780939579, 0.0507207065820694, -0.040155258029699326, 0.013851179741322994, -0.09630413353443146, -0.012529753148555756, 0.08176414668560028, 0.05994131416082382, 0.0013053410220891237, -0.035347871482372284, -0.06649265438318253, 0.07997933030128479, -0.042037565261125565, -0.06072461977601051, 0.09246786683797836, -0.0072363922372460365, 0.01850724034011364, 0.03905143961310387, -0.07601091265678406, -0.04824458062648773, -0.014410853385925293, -0.06455439329147339, -0.0593516007065773, -0.047922395169734955, -0.07904111593961716, -0.05896637961268425, -0.05629009008407593, -0.08674604445695877, 0.017179397866129875, -0.0020149857737123966, 0.02413070574402809, 0.024688012897968292, 0.027266085147857666, -0.015890855342149734, -0.00813567265868187, 0.024672919884324074, -0.020992467179894447, 0.019298823550343513, 0.022587062790989876, 0.06570186465978622, 0.061541132628917694, -0.07291612029075623, -0.010421186685562134, 0.032753147184848785, -0.06230449676513672, 0.040921296924352646, 0.05855383351445198, -0.035908423364162445, 0.05353318154811859, -0.013773049227893353, 0.0073576089926064014, 0.016397720202803612, 0.03753839433193207, 0.04765179380774498, -0.041083212941884995, -0.013994180597364902, -0.015261827036738396, 0.0982649177312851, 0.05605688691139221, -0.041869863867759705, -0.017181048169732094, 0.03721241280436516, -0.005489564035087824, 0.026647603139281273, -0.07785916328430176, 0.0476430244743824, -0.006558667402714491, 0.06363014876842499, -0.05705825239419937, -0.048359137028455734, -0.09657922387123108, -0.020021332427859306, 0.05151694640517235, 0.0028305412270128727, -0.012787899002432823, -0.09800048917531967, 0.01322718895971775, 0.08181536942720413, -0.04321233555674553, -0.0016350646037608385, -0.03537006303668022, 0.041411954909563065, 0.028577959164977074, 0.01855066418647766, 0.01671769842505455, -0.04467424377799034],
"DIV_FpI":[-0.010708109475672245, -0.0732470378279686, -0.033443547785282135, -0.06361733376979828, 0.017653197050094604, -0.030770231038331985, -0.0766882598400116, 0.08713997155427933, -0.0696694403886795, 0.0565333366394043, 0.0079630296677351, 0.009157304652035236, 0.07795052230358124, 0.00863052811473608, 0.009487103670835495, -0.021366223692893982, -0.08859013020992279, -0.052845098078250885, 0.07517100870609283, 0.030445149168372154, -0.031006425619125366, -0.011518558487296104, 0.031634584069252014, 0.006774903275072575, -0.0008412582101300359, 0.05720775946974754, -0.03664165362715721, 0.04671872407197952, -0.04702712222933769, 0.08346223086118698, -0.02042539417743683, 0.005731453187763691, -0.02509506233036518, 0.04370206221938133, -0.06398718804121017, 0.052075039595365524, 0.05920809134840965, 0.0037172543816268444, 0.07034561783075333, -0.018100138753652573, 0.002755390014499426, -0.07121799886226654, 0.03879084065556526, 0.013516174629330635, -0.02845778502523899, 0.019500035792589188, 0.014439111575484276, 0.06561631709337234, -0.10264755040407181, -0.016511712223291397, -0.018063146620988846, 0.08819841593503952, -0.0031949833501130342, -0.07884415239095688, -0.10739012062549591, 0.007700629997998476, 0.049550142139196396, 0.0866587832570076, 0.054501283913850784, 0.10046342760324478, 0.01546743419021368, -0.05334487929940224, -0.02652003802359104, -0.009483176283538342, 0.011785115115344524, 0.04965313896536827, -0.030048802495002747, -0.043639082461595535, -0.004809096921235323, -0.0515226274728775, 0.08381897956132889, 0.003956930246204138, 0.03591177612543106, 0.04015829414129257, -0.03484338894486427, 0.04027436673641205, 0.09792015701532364, 0.013287014327943325, 0.09490979462862015, -0.024792836979031563, 0.04872164502739906, 0.026059577241539955, -0.05917894095182419, -0.011415015906095505, -0.024944854900240898, 0.00499876169487834, -0.06721813976764679, -0.03442658111453056, -0.002175490139052272, 0.0004200722905807197, -0.10891042649745941, 0.021674668416380882, 0.03700282797217369, -0.0014478170778602362, -0.013477527536451817, -0.02742406167089939, -0.01233668439090252, 0.02371596358716488, 0.04435785114765167, -0.03723753243684769],
"EH_LABEL":[-0.03541884943842888, 0.012697970494627953, 0.07690317928791046, 0.10800164937973022, -0.033531446009874344, 0.010248170234262943, 0.08690237253904343, -0.018254421651363373, 0.006330807693302631, 0.054908059537410736, 0.05281105265021324, 0.01866377331316471, -0.03986826166510582, 0.012461063452064991, -0.0570770688354969, 0.010465170256793499, -0.0007985670818015933, 0.014928294345736504, -0.08143509179353714, -0.04576095566153526, -0.014382844790816307, 0.09261113405227661, -0.06843073666095734, 0.08642790466547012, 0.010645134374499321, 0.02887858636677265, -0.08228367567062378, 0.06679805368185043, 0.0023300996981561184, 0.02060936950147152, 0.06778941303491592, 0.10305429995059967, 0.06289057433605194, -0.020899023860692978, -0.024045836180448532, 0.0433543361723423, -0.0338776558637619, -0.05156976729631424, -0.02495928853750229, -0.060252029448747635, -0.1022094339132309, 0.014480574987828732, 0.0545940026640892, 0.04824232682585716, 0.06658189743757248, 0.11414545774459839, 0.08304191380739212, -0.03313761577010155, -0.056730128824710846, -0.005165864247828722, -0.00412571569904685, -0.0007486011018045247, -0.03322497382760048, 0.0425163209438324, 0.0785580724477768, 0.015084332786500454, 0.049294766038656235, 0.07518152892589569, -0.008224403485655785, -0.08819448202848434, -0.020814890041947365, 0.054976895451545715, -0.06431052833795547, 0.026952102780342102, -0.02861913852393627, -0.05228573456406593, -0.08044329285621643, 0.02844928950071335, 0.06669115275144577, -0.005387885496020317, 0.05081101134419441, -0.0627083107829094, -0.0785573348402977, -0.042252350598573685, -0.0632990375161171, -0.042457811534404755, -0.07097408920526505, 0.0032806433737277985, 0.039354246109724045, 0.054314617067575455, 0.04231691733002663, 0.00793430395424366, -0.06007056310772896, -0.06129273772239685, -0.008646488189697266, -0.024291129782795906, -0.06316813081502914, 0.02861824445426464, 0.029990797862410545, -0.014714590273797512, 0.005561451427638531, 0.06847269088029861, 0.05630529299378395, -0.015434914268553257, 0.08646618574857712, -0.0025325058959424496, -0.0046173883602023125, -0.04263639822602272, -0.021261105313897133, 0.02382785640656948],
- "ExternalSymbol":[0.014755810610949993, -0.049842361360788345, -0.06733497977256775, 0.05401315540075302, 0.061938412487506866, 0.02437831088900566, -0.06823863834142685, 0.03685877099633217, 0.02961423434317112, -0.04944299906492233, -0.1271103173494339, 0.030452819541096687, 0.019848955795168877, -0.03185190260410309, 0.06586895883083344, 0.0007315169204957783, 0.010839227586984634, -0.09547370672225952, -0.01799146644771099, -0.02204788289964199, 0.048699937760829926, 0.004187166225165129, 0.004053634125739336, -0.04464051127433777, -0.005158414598554373, -0.0416896678507328, -0.024279240518808365, -0.05358913540840149, -0.04719633609056473, -0.07180647552013397, 0.02559211477637291, 0.04657098650932312, 0.08353757858276367, -0.0023563469294458628, 0.046847302466630936, -0.03508693352341652, 0.0696689784526825, 0.054716791957616806, -0.012037037871778011, 0.019885245710611343, 0.01824580691754818, -0.06719563156366348, -0.05447190999984741, 0.08877509087324142, -0.01375679112970829, -0.014463561587035656, -0.049798283725976944, 0.06304343044757843, -0.007584648672491312, -0.016156170517206192, 0.024602508172392845, 0.004940119571983814, -0.04088609293103218, 0.0026271860115230083, 0.00787595845758915, -0.01889132149517536, -0.041029710322618484, 0.07343143969774246, -0.02505693957209587, -0.04825644940137863, 0.060728199779987335, 0.00460366066545248, 0.020744791254401207, 0.04238201677799225, -0.024090539664030075, -0.05792662873864174, 0.07639332860708237, -0.07511764764785767, -0.08259762078523636, 0.07901840656995773, -0.000285966758383438, 0.021390466019511223, -0.07818973809480667, -0.02385067008435726, -0.0014113716315478086, -0.055170729756355286, 0.00946732610464096, 0.02471417747437954, 0.07941421121358871, 0.006746167317032814, -0.06766024231910706, -0.089698426425457, 0.01933225803077221, -0.06994582712650299, -0.10149082541465759, 0.06007266044616699, -0.14545120298862457, -0.03447172790765762, 0.03258124738931656, 0.04966919496655464, 0.023691890761256218, -0.014501980505883694, 0.05896589905023575, 0.04760534316301346, -0.017742110416293144, 0.0019451226107776165, -0.01854461058974266, -0.04744676500558853, -0.017504630610346794, 0.05197983980178833],
"FLDCW":[-0.0138143515214324, 0.021748993545770645, 7.070673746056855e-05, -0.0897645577788353, 0.09824047237634659, -0.07988506555557251, -0.03454058617353439, 0.0019847718067467213, 0.04983500763773918, 0.03934836760163307, -0.01007675752043724, -0.07798215001821518, -0.08095540851354599, 0.002752745756879449, 0.030696945264935493, 0.017224561423063278, 0.00200466881506145, 0.055515315383672714, -0.06178406998515129, -0.07683275640010834, 0.06503588706254959, 0.06047344580292702, 0.017141321673989296, -0.021984437480568886, -0.05550537258386612, -0.10371828079223633, 0.04531969875097275, 0.04299109801650047, 0.008607891388237476, -0.015554985031485558, -0.08462150394916534, 0.01943030022084713, -0.03486369550228119, -0.06457459926605225, -0.0051103211008012295, 0.05992105230689049, 0.0358397401869297, -0.04655934497714043, -0.018018357455730438, -0.057540085166692734, 0.0061888862401247025, -0.013676634058356285, -0.05362136662006378, 0.06076344475150108, 0.014500541612505913, 0.04466172680258751, 0.025775697082281113, 0.034106262028217316, -0.045596618205308914, 0.022729532793164253, 0.0068075573071837425, 0.033541467040777206, 0.04034329950809479, -0.05922241508960724, -0.11147011071443558, 0.10801365971565247, 0.028543133288621902, -0.076783187687397, 0.0018997815204784274, -0.030598029494285583, 0.04199691861867905, -0.09739390015602112, 0.06310229748487473, -0.03830089420080185, -0.03836864233016968, 0.02324736677110195, 0.10289694368839264, -0.08237223327159882, 0.09511970728635788, -0.022883199155330658, 0.07018155604600906, 0.021149639040231705, 0.06003378704190254, 0.020026177167892456, -0.019267164170742035, 0.06961971521377563, -0.004955677315592766, -0.07218261808156967, 0.08104820549488068, -0.0418921560049057, -0.0317075252532959, 0.020996741950511932, -0.009143776260316372, 0.05348548665642738, -0.0625229999423027, -0.06267517060041428, -0.09454416483640671, -0.043331023305654526, -0.06992270052433014, -0.027888890355825424, -0.08271876722574234, -0.05188243091106415, -0.010446823202073574, 0.05846165865659714, -0.010190286673605442, -0.03009830228984356, 0.03426814824342728, -0.03598400205373764, -0.1076725572347641, -0.028831692412495613],
"FNSTCW":[-0.08537304401397705, 0.014420966617763042, 0.026950713247060776, -0.008387862704694271, -0.0038766334764659405, 0.026867343112826347, -0.030130255967378616, -0.04617878049612045, -0.007106459699571133, -0.0215947013348341, 0.007403566502034664, 0.032729458063840866, 0.0008728280663490295, -0.017559584230184555, 0.017324298620224, -0.014857987873256207, -0.03798896074295044, -0.05294371768832207, 0.05491216480731964, -0.04219334200024605, -0.024796022102236748, 0.033826109021902084, 0.04021430388092995, 0.015585671178996563, -0.025553781539201736, -0.011536196805536747, 0.021523986011743546, 0.01087264809757471, -0.023965656757354736, 0.021311553195118904, -0.0554395355284214, -9.890173532767221e-05, -0.0012819130206480622, -0.055725399404764175, 0.008443817496299744, 0.014645406976342201, 0.09493250399827957, 0.005851465743035078, -0.0346904918551445, -0.018780557438731194, -0.0024646760430186987, -0.04922417551279068, -0.025316428393125534, -0.047623440623283386, 0.04252983629703522, 0.008884137496352196, 0.024444259703159332, 0.11018849164247513, 0.06603030860424042, 0.10775407403707504, -0.06696148216724396, 0.07046543061733246, 0.03569186478853226, 0.06831049919128418, 0.10069368779659271, -0.07917457073926926, 0.07819988578557968, 0.0325605608522892, 0.028253860771656036, -0.03586380183696747, 0.08094784617424011, -0.08532348275184631, 0.08135068416595459, 0.08752897381782532, 0.07736475020647049, 0.03881741315126419, 0.01930568367242813, 0.01373430248349905, 0.07003094255924225, 0.021482432261109352, 0.0606292188167572, 0.005889599211513996, -0.06958997994661331, 0.04857232794165611, 0.09418252855539322, 0.030624384060502052, -0.05853968486189842, 0.0978643149137497, 0.042890243232250214, -0.06594833731651306, -0.00445757107809186, 0.028062766417860985, 0.04270890727639198, 0.049651019275188446, -0.10246159136295319, -0.04101993143558502, -0.06874924898147583, -0.047776881605386734, 0.060615722090005875, 0.022016024217009544, -0.0476866140961647, -0.09320542216300964, -0.06186588481068611, 0.030679777264595032, -0.01664678566157818, -0.02508559450507164, -0.0495455376803875, 0.02986457757651806, 0.0242463406175375, -0.03076062723994255],
- "FrameIndex":[0.05219179764389992, -0.01926516741514206, -0.021848104894161224, -0.008528115227818489, 0.02989117242395878, -0.012461756356060505, -0.050973404198884964, 0.026713935658335686, 0.01968700997531414, -0.001058116089552641, 0.009182002395391464, 0.03877940773963928, 0.070717453956604, -0.0028735792730003595, 0.0528000183403492, -0.015265910886228085, 0.007753959856927395, 0.01596899703145027, -0.07933179289102554, -0.02578687109053135, 0.02417992427945137, -0.03462255373597145, 0.04385964199900627, 0.004388607107102871, 0.03716951236128807, 0.04064105078577995, 0.07711678743362427, 0.0068300217390060425, -0.05443308874964714, -0.010809220373630524, -0.03124961629509926, 0.004911563824862242, -0.09201066941022873, 0.051436200737953186, 0.015400445088744164, 0.07804328948259354, -0.02971532940864563, -0.0003241244703531265, -0.02131350338459015, -0.09173687547445297, -0.01707594096660614, 0.0025449323002249002, 0.08701702952384949, 0.10675988346338272, -0.05082142353057861, 0.021581847220659256, -0.04104776680469513, 0.08402986079454422, -0.06109907105565071, 0.015201682224869728, 0.04374992102384567, -0.028573378920555115, -0.07767742872238159, 0.07216905802488327, 0.020538095384836197, -0.01229778677225113, 0.003033912740647793, -0.0007747758063487709, -0.09185474365949631, -0.02851664461195469, -0.009441743604838848, 0.05500328913331032, -0.002983751241117716, -0.09198789298534393, -0.051319632679224014, -0.054626885801553726, -0.020108554512262344, 0.0010591084137558937, -0.009138713590800762, 0.07223176956176758, -0.022099260240793228, 0.016025206074118614, -0.05320229008793831, 0.025131219998002052, 0.06626036763191223, 0.07639450579881668, -0.027084894478321075, 0.06581225991249084, -0.017618829384446144, -0.03859466314315796, -0.03385398909449577, 0.018783841282129288, -0.0730312392115593, 0.06957981735467911, -0.03065340407192707, 0.020685074850916862, -0.05311165004968643, 0.09466810524463654, 0.00955914705991745, -0.013919183053076267, -0.05540250986814499, -0.03087283857166767, -0.009688221849501133, 0.016239993274211884, -0.012926830910146236, -0.027712060138583183, -0.06342892348766327, -0.011996395885944366, 0.05536693334579468, -0.04359230771660805],
"FsFLD":[-0.0508677139878273, -0.05399654433131218, -0.07149481028318405, -0.047971777617931366, 0.0019320917781442404, -0.007547610439360142, 0.0815814733505249, -0.12202084064483643, -0.08665104955434799, 0.03356856107711792, -0.15713559091091156, -0.0400867722928524, -0.006232412997633219, 0.044278621673583984, 0.09549921005964279, -0.029399411752820015, 0.01864752173423767, -0.04044967144727707, 0.05652021989226341, -0.09881851822137833, 0.025765251368284225, -0.02329906076192856, -0.06028103083372116, 0.09247462451457977, -0.04210466891527176, 0.03263019770383835, -0.03578515350818634, 0.0314578041434288, 0.003650028258562088, 0.04645871743559837, -0.010650137439370155, 0.015904754400253296, 0.018990037962794304, -0.005266033578664064, 0.038479309529066086, 0.008642041124403477, -0.049301791936159134, 0.09484748542308807, 0.005372038576751947, -0.08711376041173935, 0.07584445923566818, 0.09458201378583908, -0.00032702009775675833, 0.048093944787979126, -0.08043119311332703, 0.049779392778873444, -0.006967591121792793, -0.07319328933954239, 0.01582382619380951, -0.006244257558137178, -0.011940727941691875, -0.0013992231106385589, -0.028953444212675095, 0.010995968244969845, -0.005534093361347914, -0.04907146096229553, -0.0039899349212646484, 0.05501222237944603, 0.041574396193027496, 0.030038336291909218, -0.0402531623840332, 0.07675039023160934, 0.01103806123137474, -0.006072944961488247, -0.025336718186736107, 0.06967771798372269, -0.025075508281588554, 0.0031819106079638004, -0.015812508761882782, -0.12114851176738739, 0.07704214751720428, 0.1273191273212433, -0.014406625181436539, -0.031106390058994293, -0.0602225735783577, 0.016253838315606117, -0.059025105088949203, -0.04163780063390732, 0.01571997068822384, 0.025686416774988174, 0.032261066138744354, -0.016690189018845558, 0.014042876660823822, 0.009416786953806877, -0.012661219574511051, 0.013285082764923573, 0.03095356747508049, 0.008239349350333214, 0.0444798618555069, -0.05153216794133186, -0.010029821656644344, -0.015202880837023258, 0.06329496204853058, -0.0590473897755146, 0.08585292845964432, -0.08594027906656265, 0.06057215481996536, 0.01079416275024414, -0.04006461799144745, 0.029236430302262306],
- "GlobalAddress":[0.021709734573960304, -0.03253590315580368, -0.04603651538491249, -0.02350226789712906, 0.02841794677078724, 0.01920732669532299, 0.053104616701602936, 0.03941836208105087, -0.01895466446876526, -0.030471740290522575, 0.010719750076532364, 0.020050356164574623, 0.03648754581809044, -0.021573888137936592, -0.02554452419281006, -3.637039117165841e-05, 0.05989491194486618, -0.006903402041643858, -0.08826262503862381, -0.028047384694218636, -0.04230065643787384, -0.05190899223089218, 0.06145390123128891, 0.0005839569494128227, -4.391977927298285e-05, -0.01880771853029728, 0.09660127758979797, 0.04333353415131569, 0.06461602449417114, -0.06010710820555687, -0.0690189078450203, 0.04574553668498993, -0.07640431076288223, 0.01879746839404106, 0.02076675370335579, 0.04869573190808296, 0.025147439911961555, 0.05311164632439613, 0.05711919441819191, 0.049520380795001984, 0.041169121861457825, -0.0603964701294899, -0.04195070639252663, 0.07676130533218384, -0.015161959454417229, 0.02903268299996853, -0.027548301964998245, 0.04705912992358208, -0.11194053292274475, -0.008245207369327545, -0.07792827486991882, -0.019468743354082108, 0.05482499673962593, -0.0028855702839791775, 0.05478052794933319, 0.07484771311283112, -0.011742575094103813, 0.00923923309892416, -0.05074375122785568, 0.06956734508275986, -0.045990440994501114, 0.007280972320586443, 0.040920473635196686, -0.09143709391355515, -0.06105270981788635, -0.0021254979074001312, -0.09519167989492416, 0.06324268877506256, -0.0693386048078537, -0.05100148543715477, 0.010643817484378815, -0.008162467740476131, -0.08811189234256744, -0.08640385419130325, 0.0077143507078289986, 0.030832089483737946, -0.01504515577107668, 0.07277517020702362, 0.02581198327243328, -0.052599068731069565, -0.06478387117385864, 0.01634707674384117, -0.021173706278204918, 0.030482977628707886, -0.09826494008302689, 0.07716016471385956, -0.10845024883747101, 0.04479274898767471, -0.015128640457987785, -0.03491876646876335, 0.05239150673151016, -0.03427724912762642, 0.06768845021724701, -0.04174086079001427, -0.05136744678020477, 0.0037109211552888155, -0.030324269086122513, -0.06928850710391998, -0.0395960658788681, 0.07726000994443893],
"IDIV":[-0.03631015121936798, -0.07882149517536163, -0.010781447403132915, -0.025117948651313782, 0.01618420146405697, 0.044446997344493866, 0.011386583559215069, -0.00582836102694273, -0.012903614901006222, 0.006322081200778484, -0.07392880320549011, -0.1300479620695114, -0.05186808854341507, -0.06542935222387314, 0.08297666162252426, 0.03790606930851936, -0.07716395705938339, 0.02288512885570526, -0.038660015910863876, -0.04705967381596565, -0.00015759489906486124, -0.06133948266506195, -0.022438891232013702, -0.012017307803034782, 0.01929904706776142, 0.007114879786968231, 0.00567955756559968, -0.041199274361133575, 0.08304950594902039, 0.044402915984392166, -0.10634922981262207, -0.009510381147265434, 0.009772839024662971, -0.048219580203294754, -0.0321214459836483, 0.008684953674674034, 0.009846106171607971, 0.011280585080385208, 0.0310650784522295, 0.05677618831396103, 0.025418052449822426, -0.022629115730524063, 0.0074129728600382805, 0.1081111952662468, -0.03284893184900284, 0.002745774807408452, 0.05030296742916107, 0.04322626441717148, 0.005321172997355461, 0.03260405734181404, -0.051505692303180695, -0.033541131764650345, -0.03955534100532532, 0.047906432300806046, 0.02181984856724739, -0.0026405092794448137, 0.03350621834397316, -0.10710552334785461, -0.01533215120434761, -0.06872875243425369, -0.015413723886013031, -0.007149300072342157, -0.03660491481423378, -0.003503897227346897, -0.02898445539176464, 0.040071532130241394, 0.019684670493006706, -0.10101661086082458, -0.08199643343687057, 0.05637385696172714, -0.03792939707636833, 0.03106122836470604, -0.0590706542134285, -0.03607700765132904, -0.09597010910511017, -0.005815848242491484, 0.017992950975894928, 0.0007907312246970832, 0.04653536528348923, -0.03997295722365379, 0.006737773306667805, 0.11695551127195358, 0.022216010838747025, 0.041878726333379745, -0.035456813871860504, 0.04327021911740303, -0.03799387812614441, 0.10658515244722366, 0.010188632644712925, 0.09275273978710175, 0.09797771275043488, -0.12400814890861511, 0.03475511074066162, -0.08061601221561432, 0.022533612325787544, -0.11562027782201767, -0.026964085176587105, 0.08614259958267212, -0.025526022538542747, 0.040927182883024216],
"ILD_Fp":[0.01509383600205183, -0.044326793402433395, -0.051242612302303314, -0.053859174251556396, -0.013097256422042847, -0.06370041519403458, 0.06120477616786957, 0.050328709185123444, -0.04184471070766449, 0.023432370275259018, -0.06435256451368332, 0.02055867575109005, 0.08239544183015823, 0.012251744978129864, -0.05063817650079727, 0.04293346777558327, -0.05919358506798744, -0.03159564360976219, -0.0037220751401036978, -0.001002405071631074, -0.026786377653479576, -0.07405146211385727, 0.044357798993587494, 0.08067265897989273, -0.05229390412569046, -0.06903751194477081, 0.010448710061609745, 0.006885232869535685, -0.052135784178972244, 0.08535145968198776, 0.041820794343948364, -0.020588336512446404, 0.07256042212247849, -0.017755955457687378, -0.032768987119197845, 0.06633710861206055, -0.03427698463201523, -0.10930930078029633, 0.05371936410665512, -0.06794329732656479, -0.014769122004508972, -0.07577606290578842, 0.07853815704584122, -0.09360899031162262, 0.05865737050771713, -0.034065186977386475, 0.05096115916967392, 0.0888199508190155, -0.03904300555586815, 0.03125728294253349, -0.0634637326002121, 0.03385297581553459, 0.027269205078482628, -0.07597903162240982, 0.008366324007511139, -0.03017764538526535, 0.011727942153811455, -0.04941355064511299, 0.027957690879702568, 0.09743025153875351, 0.004836047999560833, -0.028614182025194168, 0.016423141583800316, 0.0895770713686943, 0.025168858468532562, 0.030979957431554794, 0.016665387898683548, 0.025412173941731453, -0.035893514752388, -0.05403519794344902, 0.02931641787290573, 0.07742571830749512, -0.07045850157737732, -0.03433118015527725, -0.03651195392012596, -0.04036823660135269, -0.08663841336965561, 0.05561026185750961, 0.06927209347486496, -0.010819001123309135, -0.10697789490222931, 0.009881369769573212, 0.055065181106328964, -0.06379911303520203, 0.04137800633907318, 0.030417418107390404, -0.03515362739562988, -0.09139228612184525, 0.029920026659965515, 0.027388064190745354, -0.06739232689142227, 0.07639766484498978, -0.044223885983228683, 0.02472294308245182, -0.052025098353624344, 0.014643780887126923, 7.120784721337259e-05, 0.018760213628411293, -0.002873474732041359, 0.015561423264443874],
"IMPLICIT_DEF":[-0.026583483442664146, -0.03995991870760918, 0.03633055090904236, -0.04622741788625717, -0.02326572686433792, 0.02231338992714882, -0.014788332395255566, -0.09906739741563797, 0.022785643115639687, -0.014632754027843475, -0.1041543111205101, 0.05013664439320564, -0.08690599352121353, -0.08063319325447083, 0.030247388407588005, -0.09707676619291306, 0.03499408811330795, 0.012669776566326618, 0.06481463462114334, -0.040453050285577774, -0.0489707849919796, -0.07584276050329208, 0.001047363504767418, 0.08496157824993134, 0.02357148937880993, -0.06866959482431412, 0.09267362207174301, 0.030527250841259956, -0.031355831772089005, 0.02419896423816681, -0.02442512847483158, 0.029297800734639168, 0.10321355611085892, 0.06579483300447464, -0.012722077779471874, 0.10042434185743332, -0.004708406049758196, 0.007217984646558762, 0.0753282904624939, -0.07088368386030197, -0.07383686304092407, 0.06410741060972214, 0.06312107294797897, 0.06989452987909317, 0.03766098991036415, -0.0008440924575552344, -0.023516006767749786, -0.04153933748602867, 0.07342316210269928, 0.05416297912597656, -0.02841850183904171, 0.04128013551235199, -0.001023625023663044, 0.005061942618340254, -0.06027042120695114, 0.025808431208133698, 0.027118714526295662, -0.08965771645307541, 0.012222534976899624, 0.008590211160480976, -0.01785023882985115, 0.03389652445912361, 0.0038459128700196743, 0.021088456735014915, -0.060241442173719406, 0.052924126386642456, -0.03849414363503456, 0.0044007860124111176, 0.05139085650444031, -0.06002991273999214, 0.026294095441699028, 0.06567239761352539, 0.1145782321691513, -0.02774081937968731, -0.07959162443876266, -0.00901349913328886, -0.09212079644203186, -0.016664501279592514, -0.019095804542303085, 0.05008011311292648, -0.016630882397294044, -0.007292845752090216, 0.01243519689887762, 0.011623953469097614, -0.0202464796602726, 0.08120717853307724, 0.04192841053009033, -0.014358888380229473, 0.0402902215719223, -0.05741799250245094, 0.0023748986423015594, -0.0007613254711031914, -0.11052780598402023, -0.08283583074808121, -0.018524790182709694, -0.09601832926273346, 0.037600427865982056, -0.06403559446334839, -0.08838459849357605, 0.01904650405049324],
@@ -75,12 +71,10 @@
"INC":[-0.04204729199409485, -0.04558457434177399, -0.004308773670345545, 0.08560862392187119, -0.025844622403383255, -0.01385454647243023, -0.06715847551822662, 0.04059276729822159, 0.0008142509614117444, -0.04987747594714165, 0.05252164602279663, -0.07536070048809052, 0.012251293286681175, -0.01428443193435669, 0.028742481023073196, -0.024608345702290535, 0.009724774397909641, -0.024144234135746956, -0.04345421493053436, -0.03454094007611275, -0.03657921776175499, -0.025569358840584755, 0.04140102490782738, -0.02267373353242874, -0.05346262827515602, -0.07470668852329254, -0.03458420932292938, -0.015982985496520996, 0.013558092527091503, -0.029305797070264816, 0.026653757318854332, -0.00041234202217310667, 0.038508299738168716, 0.08509717136621475, 0.0016276738606393337, -0.013578594662249088, 0.05669381096959114, 0.0274334829300642, 0.023921431973576546, -0.02701006643474102, -0.09357035905122757, 0.07844959199428558, -0.03195708245038986, 0.044196177273988724, 0.017355425283312798, -0.04172753170132637, -0.07773707062005997, 0.018204662948846817, -0.07242465019226074, 0.07735569030046463, 0.03859752044081688, 0.08490721136331558, 0.04661087319254875, 0.015468046069145203, 0.02267235703766346, -0.030244702473282814, -0.043930262327194214, -0.015585970133543015, -0.004605699330568314, 0.0052457586862146854, -0.027553195133805275, -0.06406774371862411, 0.008009923622012138, -0.09624558687210083, 0.07006736844778061, 0.052846722304821014, -0.029392898082733154, -0.0659954622387886, -0.10725440829992294, 0.04428407922387123, 0.02606845460832119, 0.018936248496174812, -0.013534934259951115, 0.03338829427957535, -0.06049540638923645, 0.007389454171061516, 0.030835872516036034, -0.026952944695949554, -0.008518273010849953, 0.07688802480697632, 0.03663042560219765, -0.09961165487766266, -0.02765841968357563, 0.06263019144535065, -0.003026304766535759, -0.0023868512362241745, -0.052803706377744675, 0.04688272252678871, 0.08415349572896957, -0.044724639505147934, 0.01759890839457512, 0.022962408140301704, 0.00944716576486826, -0.084384024143219, -0.02845100499689579, -0.05094959959387779, -0.08001884073019028, 0.0449872724711895, -0.05161838233470917, 0.015422065742313862],
"INLINEASM":[0.09296883642673492, -0.007579821161925793, 0.05054628103971481, 0.0011402746895328164, -0.02369365282356739, -0.040429845452308655, 0.048763860017061234, -0.012725423090159893, -0.017820369452238083, -0.0700153335928917, -0.00037883210461586714, 0.06301063299179077, 0.0503254272043705, 0.023893356323242188, -0.07308998703956604, 0.058056626468896866, -0.002504807896912098, -0.03528450429439545, -0.0775352418422699, -0.08423604816198349, 0.01841139607131481, 0.07128658145666122, 0.01363592129200697, 0.05391324311494827, 0.04803359508514404, 0.06145099550485611, -0.03153276443481445, 0.019207997247576714, 0.07138897478580475, 0.06972941011190414, 0.06482893973588943, -0.019937975332140923, -0.00694684125483036, 0.0624234639108181, 0.08495642989873886, 0.017590269446372986, -0.0075670769438147545, 0.05114367976784706, 0.031221428886055946, -0.07108655571937561, -0.018287384882569313, 0.035706836730241776, -0.0794610008597374, -0.03627452626824379, -0.06174106150865555, -0.036826081573963165, -0.030408767983317375, 0.008271732367575169, -0.09423738718032837, 0.004248321522027254, 6.044749170541763e-05, 0.011095447465777397, -0.10245273262262344, -0.07278212904930115, -0.00845671258866787, 0.008961541578173637, 0.019341865554451942, 0.010205359198153019, 0.0724569708108902, -0.08050914853811264, -0.057010360062122345, 0.05053231865167618, -0.04844024032354355, 0.057458631694316864, 0.007486356887966394, -0.029497744515538216, 0.009812748059630394, -0.05314056575298309, 0.11012034863233566, -0.0647352784872055, 0.017479702830314636, -0.027027146890759468, -0.015448061749339104, 0.06321517378091812, -0.06948030740022659, 0.030430838465690613, -0.022251488640904427, -0.0358838327229023, 0.020705783739686012, -0.10970951616764069, -0.07724311202764511, 0.03224516287446022, 0.004828427918255329, 0.07738938182592392, -0.0036471053026616573, 0.06867322325706482, -0.07092054188251495, -0.024759342893958092, -0.054835252463817596, 0.019259851425886154, 0.011149682104587555, -0.09652992337942123, 0.050764426589012146, -0.0809553936123848, -0.04605351760983467, 0.0399462915956974, 0.05396333709359169, -0.01706104166805744, -0.031266387552022934, 0.020599452778697014],
"IST_Fp":[-0.046584248542785645, -0.07452045381069183, 0.03998621925711632, 0.03091888502240181, 0.016272397711873055, -0.00985297653824091, -0.007199955638498068, -0.03536335751414299, 0.01673988439142704, 0.07562774419784546, 0.023876583203673363, -0.008683494292199612, 0.04009688273072243, -0.03663905709981918, -0.014492983929812908, 0.07349997758865356, 0.028999919071793556, -0.07499339431524277, -0.03727814927697182, -0.046455491334199905, -0.032447993755340576, 0.02374599315226078, -0.044662121683359146, -0.025333719328045845, 0.037562429904937744, 0.0006656686891801655, -0.00804421491920948, 0.06697870045900345, 0.04367857426404953, -0.0583018884062767, 0.03050180710852146, 0.053111929446458817, -0.04168881103396416, -0.027295507490634918, 0.057777389883995056, 0.08833678811788559, -0.026598922908306122, 0.005393106956034899, -0.05517015606164932, -0.0731138065457344, 0.07386088371276855, -0.07228095829486847, 0.023828018456697464, -0.0025013380218297243, -0.012031037360429764, 0.029700662940740585, -0.101964570581913, 0.0899822935461998, 0.013285316526889801, 0.002607472240924835, 0.04784732311964035, -0.044669900089502335, -0.04348702356219292, -0.07007527351379395, -0.016267215833067894, 0.059609103947877884, -0.036534957587718964, 0.013465121388435364, 0.10186120122671127, 0.015473871491849422, -0.08443709462881088, -0.004981503821909428, 0.06996916979551315, 0.011159068904817104, -0.07315052300691605, 0.024891534820199013, 0.0426689088344574, 0.008847315795719624, -0.06540054082870483, -0.09095568209886551, 0.053956128656864166, -0.010535894893109798, 0.035168495029211044, 0.04921877756714821, -0.07781729847192764, 0.006958760786801577, -0.05714801698923111, -0.06458019465208054, -0.055241748690605164, -0.007552466355264187, -0.02490214817225933, -0.014270482584834099, 0.03710750862956047, 0.003406278323382139, -0.044638775289058685, -0.09159127622842789, -0.025353819131851196, -0.07952282577753067, -0.02874225378036499, -0.06654132902622223, 0.0031955954618752003, 0.0602104589343071, -0.09261002391576767, -0.06175351142883301, 0.01194009743630886, -0.0348934531211853, 0.04460763558745384, -0.08773446083068848, 0.04335169121623039, 0.054603610187768936],
- "Immediate":[-0.039664868265390396, 0.028720445930957794, -0.057207897305488586, 0.04179477319121361, 0.04477043077349663, 0.020050648599863052, -0.056656818836927414, -0.025030966848134995, -0.04394019395112991, 0.04849115386605263, 0.012325904332101345, 0.06731707602739334, 0.04568001255393028, -0.04773757979273796, -0.012142524123191833, -0.03986259177327156, -0.027249159291386604, -0.04930245876312256, -0.10542229562997818, -0.05678592994809151, -0.038303568959236145, -0.07283245027065277, 0.0217409897595644, -0.01139344647526741, 0.006936497986316681, -0.04702157527208328, 0.09977010637521744, -0.035237088799476624, 0.028822069987654686, -0.0691431537270546, -0.0829710066318512, -0.1289154589176178, -0.08470306545495987, -0.06731563061475754, 0.06642980873584747, 0.026025734841823578, -0.04049745202064514, 0.030080674216151237, 0.04203929752111435, 0.06834205985069275, 0.04315062239766121, 0.00788890291005373, 0.03426999971270561, 0.08819636702537537, 0.004112098831683397, 0.03392210975289345, 0.010541473515331745, 0.08045777678489685, -0.02914009988307953, 0.0624285452067852, 0.03299122676253319, -0.05355033650994301, -0.07568570226430893, 0.08106201142072678, 0.0376802459359169, -0.04886564612388611, -0.10992937535047531, -0.00761816743761301, -0.014918084256350994, 0.03816765174269676, -0.04981819912791252, 0.00031993765151128173, 0.011382698081433773, -0.029902901500463486, -0.0117422454059124, -0.057965945452451706, -0.09519924223423004, 0.020727403461933136, -0.04526710882782936, 0.09883677959442139, 0.018033087253570557, -0.003035350237041712, -0.06968960911035538, -0.09893210977315903, -0.01264366414397955, 0.017397744581103325, -0.08519260585308075, 0.09382850676774979, -0.055508699268102646, -0.026548130437731743, -0.013868317008018494, -0.03162496164441109, 0.06089535728096962, -0.01583624631166458, -0.060260944068431854, 0.06709896773099899, -0.09333796799182892, -0.02887417934834957, -0.03424007445573807, -0.01687423326075077, 0.11968979239463806, -0.08361987769603729, 0.09037765115499496, -0.04322688281536102, -0.040831610560417175, -0.061376459896564484, -0.03485504537820816, 0.016033072024583817, 0.004106835462152958, -0.03354674205183983],
"Int_MemBarrier":[0.0418969988822937, -0.06285926699638367, -0.018717624247074127, -0.0031687396112829447, 0.04023218899965286, 0.08492552489042282, -0.06942103803157806, 0.005588027182966471, -0.08964942395687103, 0.055396437644958496, -0.06732998788356781, 0.06981600075960159, -0.05258888751268387, -0.06051918491721153, 0.02948639541864395, -0.04473342001438141, 0.01574157550930977, -0.04423875734210014, -0.053338322788476944, 0.008577392436563969, 0.10632415115833282, 0.040030092000961304, 0.02552260458469391, 0.026821544393897057, -0.05510386824607849, 0.05976655334234238, -0.0008300095796585083, 0.06861157715320587, -0.049591872841119766, -0.07650840282440186, -0.004643433261662722, -0.03990425914525986, 0.06366871297359467, -0.014906020835042, -0.06371121108531952, 0.0194997675716877, -0.07784571498632431, 0.029953552410006523, 0.06530797481536865, -0.09173597395420074, 0.021494632586836815, 0.052978403866291046, -0.001283245743252337, -0.05061378329992294, 0.04639996960759163, 0.06478390842676163, -0.015909312292933464, 0.013739313930273056, -0.06675873696804047, -0.0704226866364479, 0.020883914083242416, 0.07323179394006729, -0.0010066484101116657, -0.002373248105868697, -0.07056596130132675, 0.024577656760811806, 0.04880139231681824, -0.038608577102422714, 0.07695038616657257, 0.002806240925565362, 0.006876204162836075, 0.006961337756365538, 0.059363361448049545, 0.021191507577896118, -0.06366844475269318, -0.015020458959043026, -0.0815785601735115, 0.004222068004310131, -0.07691111415624619, 0.02711009606719017, 0.014720573090016842, 0.022912023589015007, 0.05272422730922699, 0.08111070841550827, -0.018083568662405014, -0.0418405644595623, 0.08496879786252975, -0.04420621693134308, 0.090696781873703, -0.02872851863503456, -0.024066468700766563, 0.07789512723684311, -0.012021118775010109, 0.041637614369392395, 0.07615016400814056, -0.042834896594285965, 0.05792360380291939, -0.051077719777822495, -0.05241186544299126, 0.006270663347095251, -0.008865885436534882, -0.09101007878780365, 0.009276151657104492, 0.036050815135240555, -0.06729964166879654, -0.014552133157849312, -0.06943532824516296, -0.023805340752005577, -0.058313168585300446, -0.04949163272976875],
"JCC_":[-0.03625413775444031, -0.041811503469944, -0.07486920803785324, -0.05052778869867325, 0.021635157987475395, -0.045879144221544266, 0.014834613539278507, -0.03941917419433594, -0.010327291674911976, -0.08194752782583237, 0.049111511558294296, 0.05970187485218048, 0.03878019377589226, -0.08208157867193222, 0.11816514283418655, -0.0021148237865418196, 0.022616155445575714, 0.02145639806985855, -0.056387223303318024, -0.07890307158231735, 0.049655016511678696, -0.09555239230394363, -0.07599814981222153, 0.04143097624182701, -0.029399001970887184, 0.01379090640693903, 0.04894237220287323, 0.04915700852870941, 0.020924754440784454, 0.11983200162649155, -0.045743830502033234, 0.04826069250702858, 0.06473162770271301, 0.032176557928323746, 0.012342192232608795, 0.03632035106420517, -0.011231182143092155, 0.03319219872355461, 0.012383898720145226, 0.017726020887494087, -0.027707353234291077, 0.052987076342105865, -0.06459034234285355, 0.03180805966258049, 0.038370322436094284, -0.018640436232089996, -0.05121193453669548, -0.052741218358278275, 0.0953487753868103, 0.0914265364408493, 0.08409767597913742, -0.009599939920008183, 0.02045055478811264, 0.009363643825054169, -0.00872961338609457, -0.08178623765707016, -0.008178372867405415, -0.005903102457523346, 0.05836755037307739, 0.011602274142205715, -0.02761419117450714, 0.016957316547632217, 0.04471946507692337, 0.005247261840850115, -0.05416998639702797, 0.00770663283765316, -0.06152857095003128, 0.021657155826687813, -0.04485960677266121, -0.0008541923016309738, 0.053551655262708664, 0.062185727059841156, -0.012641278095543385, -0.020507624372839928, -0.02900690771639347, 0.019629495218396187, 0.05620177462697029, -0.07772354781627655, -0.025509009137749672, 0.01923682540655136, 0.03035508468747139, 0.018665296956896782, 0.013450516387820244, 0.06740278005599976, 0.013274379074573517, 0.011593983508646488, 0.02331095188856125, 0.048694003373384476, 0.05861792340874672, -0.021130137145519257, 0.02437412552535534, 0.059087324887514114, 0.024816056713461876, -0.050772879272699356, -0.01114521361887455, -0.028665395453572273, -0.09630053490400314, 0.0039062038995325565, -0.08236120641231537, 0.019473683089017868],
"JMP":[-0.021766331046819687, -0.021576769649982452, -0.03795000538229942, 0.10449998080730438, -0.037742577493190765, -0.009156269021332264, 0.015289359726011753, 0.03519408404827118, -0.034353505820035934, 0.03226960077881813, 0.07340928167104721, 0.06086661294102669, 0.05736850947141647, -0.01725650765001774, -0.06702736765146255, -0.014972181059420109, 0.03435607627034187, 0.012023050338029861, 0.03370668366551399, -0.022338073700666428, -0.08280093967914581, -0.08060947060585022, 0.012210523709654808, -0.08165933936834335, 0.0016056479653343558, 0.015586943365633488, 0.11792927235364914, 0.06917431950569153, 0.02870137430727482, 0.01961304247379303, -0.027661900967359543, 0.10504695773124695, -0.03640349581837654, -0.01896090805530548, -0.011636625044047832, -0.04474593698978424, -0.029941411688923836, -0.058342345058918, 0.05885041877627373, 0.05553867667913437, 0.03953809291124344, 0.06787443161010742, 0.002061075298115611, 0.027305128052830696, 0.05792280659079552, 0.08001891523599625, 0.026575665920972824, -0.0171738862991333, -0.010685772635042667, -0.05422135442495346, 0.03660969436168671, 0.03091355785727501, -0.05900857225060463, 0.08500046283006668, -0.08218419551849365, 0.061078935861587524, 0.018783383071422577, 0.047520000487565994, -0.00014930205361451954, 0.002577823819592595, -0.06816059350967407, 0.041743114590644836, 0.03372296690940857, 0.016127480193972588, -0.07235685735940933, 0.024466760456562042, -0.03468412905931473, 0.037008773535490036, -0.060657840222120285, 0.016427740454673767, 0.08229042589664459, -0.061172664165496826, -0.009794612415134907, -0.024358782917261124, -0.06573519110679626, 0.09360098838806152, -0.07428182661533356, -0.02529928646981716, 0.09198813885450363, 0.025180503726005554, 0.03200048953294754, 0.018081925809383392, 0.0034776402171701193, 0.07848992198705673, -0.00043209362775087357, -0.01768604852259159, -0.043686315417289734, 0.04550321400165558, 0.11878672987222672, -0.008190528489649296, 0.003286525374278426, 0.06845948845148087, 0.04892893135547638, -0.053277406841516495, -0.016919657588005066, 0.032096169888973236, 0.02839065156877041, -0.01713993400335312, -0.15167304873466492, -0.02013365738093853],
"JMP_":[-0.014233234338462353, -0.0260892603546381, -0.13750334084033966, -0.050227466970682144, -0.042988359928131104, -0.027947310358285904, 0.08639533072710037, -0.16317786276340485, -0.03907149285078049, -0.05328908935189247, -0.03975899517536163, 0.04182944446802139, -0.010540750809013844, -0.11645861715078354, -0.012753792107105255, 0.002367585664615035, -0.05188040807843208, 0.0033823091071099043, -0.01240340806543827, -0.06099176034331322, -0.0015601427294313908, -0.11171454191207886, -0.04928319901227951, 0.05990544706583023, 0.015553089790046215, 0.04499414563179016, -0.034520961344242096, 0.07318194955587387, 0.013978325761854649, 0.07317976653575897, -0.029100794345140457, -0.09544635564088821, 0.030067358165979385, 0.057544808834791183, 0.005057932808995247, 0.005621553864330053, -0.03627946600317955, -0.0391962006688118, 0.03113878332078457, -0.02958066016435623, 0.012381716631352901, 0.011978821828961372, 0.13839371502399445, 0.010590317659080029, 0.06677765399217606, 0.046147286891937256, -0.05033441260457039, -0.020135121420025826, 0.032657306641340256, -0.05044032260775566, 0.05499301478266716, 0.07406507432460785, -0.0011679750168696046, 0.000989275984466076, 0.029161963611841202, 0.02679276280105114, 0.024040302261710167, 0.0710899606347084, -0.0035478041972965, -0.03730632737278938, -0.014350024051964283, 0.1638166308403015, -0.10163120925426483, 0.02900329977273941, -0.05366139113903046, 0.07186686992645264, -0.041340481489896774, 0.0401119627058506, -0.002295189071446657, -0.07949572801589966, -0.011504769325256348, 0.10675538331270218, -0.012056156061589718, -0.00748586468398571, -0.039624687284231186, -0.03555607795715332, -0.06799864768981934, -0.04550764709711075, -0.03302829712629318, -0.008404256775975227, 0.10563746094703674, -0.026095328852534294, 0.07613116502761841, 0.02101682499051094, 0.018749620765447617, 0.0056787943467497826, 0.005889789201319218, 0.03994893655180931, 0.05512934923171997, -0.004684743471443653, -0.01083239447325468, 0.0003112686099484563, 0.024348445236682892, -0.02665846049785614, 0.0064091463573277, -0.02719639055430889, 0.11076066642999649, -0.0014569570776075125, 0.0050220787525177, 0.032427236437797546],
- "JumpTableIndex":[-0.007416237145662308, 0.0038157713133841753, 0.05180662125349045, 0.03776901960372925, -0.011749244295060635, -0.02952706068754196, -0.06646136939525604, 0.02088487148284912, -0.001927916775457561, 0.018895410001277924, 0.0509350448846817, 0.057210080325603485, -0.0476078987121582, -0.00016809302906040102, -0.02341553010046482, -0.06734820455312729, 0.02047930844128132, 0.009282611310482025, 0.0038133300840854645, 0.0020261742174625397, -0.09253961592912674, 0.0766557827591896, -0.049570225179195404, -0.11510220915079117, -0.009570423513650894, -0.007274465169757605, 0.07750000059604645, 0.02489926479756832, -0.08297400176525116, 0.048176445066928864, 0.03797437995672226, 0.060842450708150864, 0.020265065133571625, -0.03559373319149017, 0.03493893891572952, -0.0036544676404446363, 0.010211148299276829, -0.06471849977970123, -0.034595828503370285, -0.05245388671755791, -0.0014119939878582954, 0.008752748370170593, -0.020637203007936478, 0.053244929760694504, 0.052053239196538925, 0.014706660993397236, 0.02803724631667137, -0.07983336597681046, 0.03106858767569065, 0.001688914722763002, -0.07647732645273209, -0.028148295357823372, -0.0528123639523983, 0.08006428182125092, -0.06398879736661911, -0.033476538956165314, 0.05217607319355011, -0.03093232959508896, 0.044230975210666656, 0.05123162269592285, -0.05225585401058197, 0.06976816058158875, -0.0014492797199636698, 0.03833283483982086, 0.08385992050170898, -0.04722217097878456, -0.00226160092279315, -0.027254855260252953, -0.09566919505596161, 0.02109321765601635, -0.032354824244976044, 0.08032239973545074, -0.046937450766563416, -0.004326784983277321, -0.026024870574474335, 0.12039119750261307, 0.1016048863530159, 0.06808122247457504, -0.012297546491026878, -0.06450799852609634, 0.015778351575136185, 0.012280710972845554, 0.04002666845917702, 0.04792468994855881, -0.06248988211154938, -0.054222140461206436, 0.018379682675004005, -0.0029111658222973347, 0.016062958166003227, 0.09880068898200989, 0.03846307471394539, 0.04975416138768196, 0.07305088639259338, -0.020941948518157005, -0.020897891372442245, 0.03872328996658325, -0.05682756006717682, 0.09583723545074463, 0.0028475294820964336, -0.05127262324094772],
"LCMPXCHG":[0.0649508610367775, -0.04321656376123428, 0.08405561745166779, -0.07786691188812256, -0.05277935788035393, 0.011031142435967922, -0.0015533932019025087, 0.08730415254831314, -0.004414519295096397, 0.04040057212114334, -0.005748671013861895, -0.013907546177506447, 0.1028006374835968, 0.09900037944316864, -0.06475479900836945, 0.024365412071347237, -0.0727076306939125, 0.06610138714313507, -0.026073187589645386, 0.08258920162916183, -0.007938066497445107, 0.07641425728797913, 0.10221290588378906, 0.029036179184913635, -0.024506229907274246, 0.00953623466193676, -0.03283938392996788, -0.07194274663925171, -0.023513879626989365, -0.017550935968756676, -0.037860531359910965, 0.042062658816576004, 0.0501263290643692, 0.02325640618801117, 0.0018605751683935523, 0.012687316164374352, -0.016979143023490906, -0.059858907014131546, -0.07078705728054047, 0.033630695194005966, 0.036799900233745575, -0.03821465000510216, -0.059619177132844925, -0.06309511512517929, 0.0019384543411433697, -0.053095221519470215, 0.00571654736995697, 0.07134073972702026, -0.02115899883210659, 0.021287376061081886, -0.04855392873287201, 0.0103003466501832, -0.008993818424642086, 0.05131004378199577, -0.0734843909740448, 0.017303360626101494, 0.008291462436318398, 0.046435531228780746, -0.055057018995285034, -0.05454597249627113, -0.009126733057200909, -0.0012434959644451737, -0.0846821740269661, -0.017736544832587242, -0.04779898375272751, 0.020568806678056717, -0.061118245124816895, -0.012131555937230587, 0.024907736107707024, -0.0161012914031744, -0.011221951805055141, -0.029136324301362038, 0.04336633160710335, -0.00514700124040246, 0.004810850135982037, 0.014044326730072498, -0.07381691038608551, -0.064864382147789, 0.041784100234508514, 0.06648915261030197, 0.038817185908555984, -0.03421948850154877, 0.019546108320355415, -0.00579161336645484, -0.06579872220754623, -0.01745537295937538, -0.07164284586906433, 0.032588109374046326, 0.009170422330498695, -0.08387100696563721, -0.04743993282318115, 0.05926872417330742, 0.03129392862319946, -0.012995549477636814, 0.007799868006259203, 0.036110181361436844, 0.01603531278669834, -0.09735894203186035, 0.014374110847711563, -0.023844046518206596],
"LD_Fp":[0.09850919246673584, 0.022097617387771606, -0.02880568616092205, 0.014175659976899624, -0.03401500731706619, -0.010281442664563656, -0.05501694604754448, -0.041856300085783005, 0.07016798853874207, -0.022585496306419373, -0.007230871357023716, 0.02143889106810093, 0.011802875436842442, -0.011940510012209415, 0.001225354615598917, -0.04420488327741623, 0.058923713862895966, 0.07726655155420303, -0.024950502440333366, -0.005545462481677532, 0.037338823080062866, -0.03718772903084755, 0.08340831100940704, 0.030300375074148178, -0.04332158342003822, -0.10117480903863907, -0.023774733766913414, 0.055412717163562775, 0.07188894599676132, 0.048699796199798584, 0.02051064558327198, -0.05177381634712219, 0.046848755329847336, 0.06421937793493271, 0.014812597073614597, 0.06599052250385284, 0.055128950625658035, 0.057206105440855026, 0.004570540506392717, 0.0006673894240520895, -0.04956628009676933, 0.018173960968852043, 0.009045585989952087, -0.09929032623767853, -0.0734606683254242, 0.009978558868169785, 0.016378602012991905, -0.0809779167175293, 0.028371425345540047, 0.07337132841348648, -0.0712965577840805, -0.07612331956624985, 0.023224541917443275, -0.01886812597513199, 0.049867402762174606, 0.04525093734264374, -0.04347287490963936, 0.04647829011082649, -0.020921878516674042, 0.055911704897880554, 0.0646883100271225, 0.043256886303424835, 0.012135359458625317, 0.06405725330114365, 0.04327752813696861, -0.06879010051488876, -0.02182726003229618, -0.030435195192694664, -0.04794333875179291, 0.03966866061091423, -0.05612926930189133, 0.061092350631952286, -0.047390542924404144, 0.06440525501966476, 0.07119303941726685, 0.036672186106443405, 0.039346762001514435, 0.05825766921043396, -0.05363740026950836, 0.026515239849686623, -0.021117106080055237, -0.061990927904844284, 0.06407181918621063, -0.02918284200131893, 0.06280291080474854, 0.05465791001915932, 0.025043612346053123, -0.015093226917088032, 0.0339696891605854, 0.039516378194093704, -0.005943501368165016, 0.037065502256155014, 0.0036617075093090534, -0.04032375290989876, -0.027956390753388405, -0.028206538408994675, 0.003602939657866955, 0.0015424611046910286, 0.03779160603880882, -0.012583530507981777],
"LEA":[-0.07203060388565063, -0.017553633078932762, 0.0402604416012764, -0.03958871215581894, -0.035693515092134476, 0.006020952947437763, 0.06661038845777512, -0.05565638095140457, -0.07512512803077698, 0.015386131592094898, 0.1531272977590561, 0.07126382738351822, -0.018143991008400917, 0.0798688530921936, -0.0836813896894455, -0.005903773941099644, -0.03920849785208702, 0.025672506541013718, -0.017640162259340286, -0.09243063628673553, 0.0272371768951416, 0.04267166927456856, -0.032052017748355865, 0.06952647119760513, -0.03414658084511757, 0.05041181296110153, 0.04035321623086929, 0.04639449715614319, -0.000271787925157696, 0.1057962104678154, -0.031690120697021484, 0.0785541757941246, -0.008634688332676888, 0.035989925265312195, -0.00988205149769783, -0.047323428094387054, -0.018978994339704514, -0.001277003320865333, -0.022872451692819595, -0.034365635365247726, -0.04628191888332367, 0.06221615523099899, 0.01957613043487072, 0.13219280540943146, 0.03662179410457611, 0.046082716435194016, 0.011469600722193718, -0.025702660903334618, -0.08428508788347244, -0.07941769808530807, -0.06742636859416962, 0.0873873308300972, 0.0038614647928625345, 0.02177446149289608, -0.004519546404480934, -0.06213155761361122, -0.011228920891880989, -0.12034870684146881, 0.008946738205850124, 0.009164049290120602, -0.02258075587451458, 0.016061170026659966, 0.0645158663392067, 0.03723616153001785, -0.06451661139726639, -0.005219440441578627, -0.055180057883262634, 0.015841009095311165, -0.01621314138174057, -0.09887613356113434, 0.04894544556736946, -0.07996354252099991, 0.0138346366584301, -0.04036646708846092, -0.07073907554149628, -0.019294722005724907, -0.08181063830852509, -0.002301511587575078, -0.03429428115487099, 0.04098176211118698, 0.0706806555390358, 0.020024126395583153, 0.043529968708753586, 0.060017164796590805, 0.003525135340169072, -0.029752371832728386, -0.021769365295767784, 0.03941021487116814, -0.002250884659588337, -0.08078912645578384, 0.015297000296413898, 0.026888463646173477, 0.048139896243810654, -0.04837239161133766, -0.036249756813049316, -0.027615496888756752, -0.15165935456752777, -0.03756902739405632, 0.015112340450286865, -0.0010633820202201605],
@@ -89,13 +83,10 @@
"LXADD":[-0.11344388872385025, 0.08068472892045975, -0.041796449571847916, -0.043138183653354645, -0.049067553132772446, -0.005337296053767204, 0.021436110138893127, -0.035862036049366, -0.05354782193899155, 0.007918866351246834, -0.033625587821006775, 0.048349399119615555, 0.07167208194732666, -0.04589017853140831, -0.023661522194743156, 0.03580676391720772, 0.03326055034995079, 0.041535746306180954, -0.008772681467235088, -0.03362834453582764, -0.008885134011507034, -0.005286931060254574, -0.09389151632785797, 0.015108847059309483, -0.020455803722143173, 0.06477829068899155, 0.012845957651734352, -0.03201524540781975, -0.07100234925746918, 0.046879976987838745, -0.06030888110399246, 0.022502053529024124, -0.10942362248897552, -0.06978410482406616, 0.0714743509888649, 0.057766277343034744, 0.038102924823760986, -0.007761931978166103, -0.11331900954246521, -0.07498679310083389, -0.002573479898273945, -0.005142265930771828, -0.04596858471632004, -0.05356051027774811, 0.10633396357297897, -0.07426618784666061, 0.037482988089323044, 0.10527358204126358, 0.08239476382732391, 0.0678592249751091, -0.014271541498601437, -0.010673552751541138, -0.0767236202955246, 0.0329856239259243, -0.02222914807498455, -0.0019944666419178247, -0.0789676085114479, 0.006855306681245565, -0.012843947857618332, -0.10197136551141739, -0.036981865763664246, 0.04500154033303261, 0.0023044694680720568, -0.0031417198479175568, -0.06536462903022766, -0.02773689292371273, 0.06672050058841705, 0.046953968703746796, 0.009028433822095394, -0.008872197940945625, 0.09054717421531677, 0.009121377021074295, 0.09400534629821777, 0.012045130133628845, -0.014854185283184052, 0.030989984050393105, -0.030203191563487053, 0.09275887161493301, -0.009853487834334373, 0.038435857743024826, 0.05689401552081108, -0.06919367611408234, -0.02360834926366806, -0.08338318765163422, 0.01904873177409172, -0.027271559461951256, -0.05529508367180824, 0.09507890790700912, -0.03128642588853836, 0.026687508448958397, -0.05117009952664375, -0.03872146084904671, 0.08641110360622406, -0.027542488649487495, -0.09849996864795685, 0.05740527808666229, -0.02291804924607277, -0.10829142481088638, 0.008436905220150948, 0.027438905090093613],
"MAXSDrr":[-0.06119297072291374, -0.04124095290899277, -0.0296846404671669, -0.045824289321899414, 0.02508155070245266, 0.007925539277493954, 0.043926920741796494, -0.03159729018807411, 0.019068658351898193, -0.013711963780224323, -0.028986897319555283, -0.04561398923397064, 0.04851536825299263, -0.03764308616518974, -0.018207892775535583, 0.016173269599676132, -0.004123492166399956, -0.025343073531985283, -0.09777097404003143, 0.0290510356426239, -0.06969164311885834, -0.06684337556362152, 0.04377250373363495, 0.06861237436532974, -0.046966683119535446, 0.0611143596470356, -0.044503044337034225, 0.023559842258691788, -0.029876690357923508, 0.011016200296580791, 0.07286348938941956, 0.00030023325234651566, 0.08359035104513168, 0.017708808183670044, 0.07800529897212982, -0.08712167292833328, 0.002862636698409915, -0.06735634058713913, 0.03052128478884697, 0.04226242005825043, 0.023851098492741585, -0.04562359303236008, -0.013745550066232681, 0.013936172239482403, -0.0647776871919632, -0.0487772636115551, 0.07015536725521088, -0.030445875599980354, -0.043143901973962784, -0.09556057304143906, 0.047779254615306854, 0.046041958034038544, 0.009388554841279984, 0.04671555384993553, -0.059331271797418594, 0.03360891714692116, 0.03569460287690163, 0.004674405790865421, 0.03280949592590332, -0.011293579824268818, -0.05531742051243782, 0.045912306755781174, 0.04241438955068588, -0.07023770362138748, -0.03889290615916252, 0.019566599279642105, 0.06292827427387238, -0.012180106714367867, -0.009482266381382942, 0.0033363515976816416, -0.028241898864507675, 0.04916750639677048, -0.011430651880800724, 0.05025538429617882, 0.02134493552148342, 0.04370661824941635, 0.08801361173391342, -0.04115797579288483, -0.06421534717082977, -0.051845721900463104, -0.041304778307676315, 0.0507316067814827, 0.049301628023386, -0.013558737933635712, -0.004291698802262545, 0.038709867745637894, -0.0636303573846817, -0.047141704708337784, 0.022303685545921326, 0.07054309546947479, 0.009679436683654785, 0.0638614296913147, -0.046838339418172836, 0.01595005951821804, -0.025526082143187523, -0.0818924531340599, 0.016986405476927757, 0.023154381662607193, 0.06338698416948318, 0.07277237623929977],
"MAXSSrr":[0.04370328411459923, 0.007435579318553209, 0.05632773041725159, 0.05872607231140137, -0.02179848775267601, -0.02491024136543274, -0.09028499573469162, -0.073136106133461, 0.0038046056870371103, -0.004702121019363403, 0.06376311928033829, 0.025374436751008034, 0.03343794494867325, -0.03841162100434303, 0.04050759971141815, 0.06359805166721344, -0.05459776520729065, -0.013898322358727455, 0.043059010058641434, 0.008913826197385788, -0.08469206839799881, 0.07041019201278687, -0.08591683208942413, 0.001833248999901116, 0.07940677553415298, -0.025694575160741806, -0.07197162508964539, -0.017312491312623024, -0.037606846541166306, -0.024861449375748634, 0.024707462638616562, -0.00026734761195257306, 0.033847302198410034, 0.05927937477827072, 0.04899705946445465, 0.0770091861486435, -0.09790053963661194, 0.057826053351163864, 0.05768071860074997, 0.01531772967427969, 0.0404951311647892, -0.04033346846699715, 0.05936214700341225, -0.029121382161974907, 0.044257547706365585, -0.10413498431444168, 0.09214437007904053, 0.017709942534565926, 0.026122651994228363, -0.08045665174722672, -0.03744427487254143, 0.09111800789833069, 0.0020880592055618763, 0.07745599746704102, 0.04109589755535126, -0.07718705385923386, -0.045550283044576645, 0.06791391223669052, 0.06261736899614334, -0.04795467481017113, 0.016496436670422554, 0.02853775955736637, -0.038986679166555405, 0.012603304348886013, 0.05299075320363045, 0.0022748250048607588, 0.00884503684937954, 0.1081618219614029, 0.05347983166575432, 0.03069908171892166, 0.015294212847948074, 0.0618034303188324, -0.07555301487445831, -0.0897526815533638, -0.07293840497732162, -0.02863491326570511, 0.01548877265304327, 0.09115951508283615, 0.011775748804211617, -0.009436656720936298, -0.07188120484352112, -0.004493236541748047, 0.0661926344037056, -0.04905804619193077, -0.06685564666986465, 0.06110713630914688, 0.018521195277571678, -0.04577818885445595, 0.07256703823804855, 0.0831693485379219, 0.008730655536055565, 0.04827301949262619, 0.0754026547074318, 0.027548737823963165, -0.07210569083690643, -0.004550515208393335, -0.06998797506093979, -0.014580612070858479, -0.04511459916830063, 0.1119980439543724],
- "MBB":[0.0285621527582407, 0.017540860921144485, -0.08473232388496399, -0.004012782592326403, 0.01284435298293829, -0.05268647149205208, 0.05576688051223755, 0.0021535248961299658, -0.03945871442556381, -0.006189210340380669, -0.015129411593079567, -0.08998296409845352, -0.023543253540992737, -0.03973307088017464, 0.03474939242005348, -0.01602775789797306, -0.07461361587047577, -0.016514597460627556, -0.016366377472877502, 0.004728052299469709, -0.023341577500104904, -0.0914730429649353, 0.030636735260486603, -0.03425632417201996, 0.03614623472094536, -0.007019295822829008, -0.0218521635979414, -0.015808485448360443, -0.05414801836013794, 0.029721688479185104, 0.09407073259353638, 0.029655681923031807, -0.005722714588046074, 0.08653672784566879, 0.01633341796696186, -0.07890991121530533, -0.07574641704559326, 0.013483843766152859, -0.0011275253491476178, -0.05623066797852516, -0.03096684440970421, -0.0019136210903525352, 0.005127475131303072, 0.005057196598500013, -0.008401975966989994, -0.0391613207757473, -0.0026145142037421465, 0.05342942103743553, 0.034099776297807693, 0.028928104788064957, -0.006105952430516481, -0.039190810173749924, 0.026784662157297134, -0.07679374516010284, -0.007475676946341991, -0.036650288850069046, 0.00774755235761404, 0.008984091691672802, -0.059830714017152786, 0.042310964316129684, 0.0681624785065651, -0.018189340829849243, -0.014816401526331902, -0.05541539564728737, -0.09348370134830475, 0.003691869555041194, -0.0010735570685938, -0.010131723247468472, -0.041050590574741364, -0.013792471028864384, -0.024337435141205788, 0.07526508718729019, 0.08163300901651382, -0.03508464992046356, -0.01681988686323166, -0.06734774261713028, -0.07656992971897125, -0.03866373747587204, 0.004544078838080168, 0.0585801787674427, -0.021823249757289886, -0.0610244981944561, -0.04469957575201988, -0.011089849285781384, -0.05069964751601219, -0.025694409385323524, -0.0670132040977478, 0.09616350382566452, 0.06308142840862274, -0.10543308407068253, 0.0023751568514853716, -0.06237253174185753, 0.05771911144256592, -0.06010056659579277, -0.016188565641641617, 0.009142348542809486, -0.014255198650062084, -0.02999819628894329, 0.00473234336823225, 0.03976761922240257],
- "MCSymbol":[0.05158298835158348, 0.05024643987417221, 0.06704410910606384, 0.0378347709774971, -0.03902719169855118, -0.08626251667737961, 0.03964311257004738, 0.06615762412548065, 0.04361319541931152, 0.03646374121308327, -0.018487416207790375, 0.0024993624538183212, 0.006693041883409023, 0.08311881870031357, 0.021111667156219482, 0.038208797574043274, 0.08689694851636887, -0.03659898787736893, 0.020775076001882553, 0.03553535416722298, 0.06854367256164551, -0.002012243028730154, 0.03658154606819153, 0.03127564862370491, 0.0363621786236763, -0.027205800637602806, -0.05243372917175293, 0.012564878910779953, -0.013430594466626644, -0.04043225944042206, -0.025083716958761215, 0.09665156900882721, 0.005077417939901352, -0.05181048810482025, 0.08925056457519531, 0.0777667909860611, -0.013708796352148056, 0.07754126191139221, 0.08393577486276627, 0.06395212560892105, -0.07428556680679321, -0.052424050867557526, 0.03497577831149101, 0.01964585855603218, -0.0429445318877697, 0.07072066515684128, 0.0017074055504053831, 0.059513408690690994, 0.013262910768389702, -0.07240563631057739, 0.09288764744997025, 0.030620144680142403, -0.046197980642318726, 0.04847298562526703, -0.03942957893013954, -0.0025783153250813484, -0.019526517018675804, 0.038867682218551636, 0.006007499527186155, -0.06366054713726044, 0.004640159662812948, 0.013837787322700024, -0.020015377551317215, -0.010317903012037277, 0.001741019543260336, 0.06261103600263596, -0.03374830260872841, 0.01629183441400528, -0.013137640431523323, 0.026046304032206535, -0.009679407812654972, -0.07085473090410233, 0.03035539574921131, -0.08764562010765076, -0.03820766881108284, -0.04181021824479103, -0.05163294076919556, 0.06666433811187744, -0.08939782530069351, 0.040260378271341324, -0.06847432255744934, 0.09106951206922531, -0.07388591021299362, -0.07479099184274673, -0.001779694459401071, -0.0963745042681694, -0.06515862792730331, -0.08404017239809036, -0.09935544431209564, 0.010541093535721302, -0.04491754248738289, 0.09378639608621597, 0.006655062548816204, 0.06637217849493027, -0.05623293295502663, -0.020134123042225838, 0.005873391404747963, -0.07765494287014008, -0.0008442706312052906, -0.03568055108189583],
"MINSDrr":[0.00284420233219862, 0.07673676311969757, 0.08602232486009598, 0.030074521899223328, -0.06255929172039032, -0.10135219246149063, 0.0772649347782135, 0.0045582992024719715, -0.01195931900292635, 0.009085145778954029, -0.04665979743003845, 0.019213048741221428, 0.022454556077718735, -0.05505772680044174, 0.035268958657979965, -0.06431140005588531, -0.001450810581445694, -0.027346337214112282, 0.041191086173057556, -0.0808955729007721, -0.04748200997710228, 0.0653977245092392, 0.042980875819921494, -0.04332194849848747, -0.024661004543304443, 0.09317019581794739, -0.06639514118432999, 0.013383567333221436, 0.051771167665719986, 0.05815904587507248, -0.05226780101656914, 0.079694002866745, -0.017969269305467606, -0.07137028127908707, -0.0011493286583572626, -0.02009846828877926, 0.006549016106873751, 0.0019126685801893473, 0.06168307736515999, -0.025323089212179184, 0.010943768545985222, 0.02157585136592388, -0.012993190437555313, -0.025179127231240273, -0.08958654850721359, -0.04273540899157524, 0.015248515643179417, 0.05456075817346573, 0.05705633386969566, -0.0038763433694839478, 0.08008016645908356, -0.004114328417927027, -0.01975642889738083, -0.014040309935808182, 0.025527596473693848, -0.06883629411458969, 0.06273050606250763, 0.05779215693473816, -0.061573851853609085, 0.01889919489622116, 0.026195447891950607, -0.021544434130191803, -0.0810774490237236, -0.016286203637719154, 0.01799311302602291, -0.08440321683883667, 0.0897485539317131, 0.08083964139223099, -0.006629236973822117, 0.051063962280750275, -0.08597207814455032, 0.029692046344280243, -0.03309508413076401, -0.09422174096107483, 0.0019163102842867374, 0.05546015128493309, -0.05980079993605614, -0.07416199892759323, -0.005134278908371925, 0.07392455637454987, -0.0634748563170433, 0.020546387881040573, -0.019978882744908333, 0.039572179317474365, -0.04754075035452843, -0.06090293824672699, -0.011185224168002605, -0.054661743342876434, 0.027916360646486282, -0.00819246843457222, -0.03119322657585144, 0.019949961453676224, 0.008312772959470749, 0.06788603216409683, 0.041624777019023895, 0.051687415689229965, -0.04819793254137039, -0.0761520192027092, -0.019374510273337364, -0.008435340598225594],
"MINSSrr":[-0.06906168162822723, 0.008121289312839508, 0.010413543321192265, 0.052863992750644684, 0.01030051801353693, -0.009280139580368996, 0.016139337792992592, -0.05126945674419403, 0.06733083724975586, -0.01006366591900587, 0.06506948918104172, 0.05012301355600357, -0.07191506028175354, 0.018038516864180565, -0.020798280835151672, 0.08538958430290222, -0.028427604585886, 0.02630189247429371, 0.010489841923117638, 0.10011959075927734, -0.067482590675354, 0.01461686473339796, 0.03908747434616089, -0.015383233316242695, -0.03783239424228668, 0.06359098851680756, -0.052475571632385254, 0.07818790525197983, -0.0030931381043046713, 0.013684416189789772, 0.04222726821899414, 0.04708671569824219, 0.01192860770970583, 0.08628913760185242, -0.06380248814821243, -0.004006511997431517, -0.02817981317639351, -0.11196613311767578, 0.01953534409403801, 0.0034300305414944887, -0.040240559726953506, 0.004963779356330633, -0.06623393297195435, -0.04386508837342262, -0.08431598544120789, -0.023293999955058098, 0.02133636176586151, 0.04054516181349754, 0.04479363188147545, 0.02776535600423813, 0.01497643906623125, 0.026148531585931778, -0.05869835242629051, -0.07451415807008743, -0.009552933275699615, -0.004124804865568876, 0.08342882245779037, 0.05295371264219284, -0.05495591461658478, -0.07350015640258789, -0.05573306977748871, 0.07158630341291428, 0.04162517935037613, 0.0019162269309163094, -0.07742705941200256, -0.05673951655626297, 0.05760834366083145, 0.08143799751996994, 0.09629082679748535, -0.05737840384244919, 0.03762679174542427, 0.022383252158761024, 0.02897579036653042, -0.0929567888379097, 0.04767351970076561, 0.05145186930894852, 0.012956425547599792, 0.04237693175673485, 0.06772835552692413, 0.011290902271866798, -0.06324069201946259, -0.04689439386129379, 0.09521757066249847, 0.05625065788626671, -0.032533977180719376, -0.00987032800912857, -0.08346299827098846, -0.06292857229709625, 0.042861636728048325, 0.08865208923816681, -0.0021774298511445522, 0.010668188333511353, -0.05791740491986275, 0.02240762859582901, -0.022414017468690872, 0.04343479871749878, 0.01852354407310486, -0.004329795949161053, -0.00262851663865149, -0.009029376320540905],
"MOV":[-0.03924819082021713, -0.015029003843665123, 0.14121688902378082, -0.05414531007409096, -0.01409768033772707, 0.05467522144317627, -0.0798286497592926, 0.042834796011447906, -0.04328306391835213, -0.12638653814792633, 0.02380293421447277, -0.010002975352108479, -0.03018246777355671, -0.09843093156814575, -0.015159506350755692, -0.03186051547527313, -0.009830419905483723, 0.024049948900938034, -0.028536750003695488, -0.05252794921398163, -0.003984724637120962, -0.09075328707695007, -0.015937313437461853, 0.07316069304943085, 0.002778300317004323, 0.003214895725250244, -0.0832214206457138, 0.012602301314473152, 0.0687694102525711, 0.1425037384033203, -0.04724106192588806, 0.05618143081665039, 0.0028424363117665052, 0.03067261539399624, 0.008477674797177315, -0.002142940880730748, 0.0036045191809535027, -0.02257452718913555, 0.013552851043641567, -0.016065331175923347, 0.03364546224474907, 0.0027604023925960064, -0.013575572520494461, 0.1340155154466629, 0.04859570413827896, 0.07984673976898193, 0.006813493091613054, -0.017625009641051292, -0.0667564794421196, -0.0025298972614109516, -0.06280945241451263, 0.08589767664670944, -0.011751428246498108, 0.04074618220329285, 0.0561428964138031, -0.0068444423377513885, 0.028041694313287735, 0.06258948892354965, 0.02493610419332981, -0.018480388447642326, -0.035079196095466614, 0.14365622401237488, -0.046609606593847275, 0.040164150297641754, -0.049927353858947754, 0.06781942397356033, -0.04828719049692154, 0.03496144339442253, -0.044686879962682724, 0.04254060238599777, 0.024320241063833237, -0.0031205937266349792, -0.049061503261327744, -0.028716804459691048, -0.056192029267549515, 0.022012677043676376, -0.0745186060667038, -0.0008951064082793891, -0.051033493131399155, 0.023357892408967018, 0.06984421610832214, 0.0057564410381019115, -0.005192344542592764, -0.003961252048611641, -0.012275456450879574, -0.018581852316856384, -0.0046620736829936504, 0.02494811825454235, 0.0520334355533123, -0.02435225434601307, 0.0008846594137139618, 0.017687007784843445, 0.07866063714027405, -0.025595100596547127, -0.020679078996181488, -0.027750879526138306, 0.10005537420511246, -0.015581297688186169, -0.08011393249034882, 0.028118811547756195],
"MUL":[-0.026987887918949127, 0.06016572564840317, 0.0787728950381279, -0.0803905576467514, 0.005736608523875475, -0.07245960086584091, -0.02662983350455761, 0.012340782210230827, 0.042490337044000626, 0.06399581581354141, -0.009004191495478153, 0.0370473749935627, -0.0605553574860096, -0.09520823508501053, 0.0010566662531346083, -0.028270091861486435, 0.08631408214569092, 0.002891023177653551, -0.051674507558345795, -0.04089691862463951, -0.04444378614425659, -0.061945777386426926, -0.026001833379268646, 0.04689744487404823, -0.07711070775985718, 0.07018855959177017, -0.02606336772441864, 0.054914504289627075, 0.03522270917892456, -0.027317974716424942, 0.02187947928905487, 0.009710998274385929, 0.01340037677437067, 0.016422593966126442, -0.058249425143003464, -0.08377814292907715, -0.04476138949394226, 0.04349169507622719, 0.05062006786465645, 0.01706511154770851, 0.020649245008826256, 0.06287672370672226, -0.03981941193342209, 0.04973218962550163, -0.03353424742817879, -0.016799092292785645, -0.031751759350299835, 0.10430201143026352, -0.04326871410012245, 0.0736854076385498, -0.0768580436706543, -0.03183818608522415, 0.010583195835351944, 0.015541432425379753, 0.03191666305065155, 0.020011236891150475, 0.041239380836486816, -0.0029152908828109503, 0.009499716572463512, -0.011166329495608807, 0.03469998389482498, 0.00607832008972764, 0.030300112441182137, -0.040855471044778824, 0.00988304428756237, 0.050531189888715744, 0.06647889316082001, -0.027519647032022476, -0.06819992512464523, 0.02215251699090004, 0.086424820125103, -0.03395787626504898, -0.020825445652008057, 0.08309803158044815, -0.0256529338657856, 0.005000723991543055, -0.03375622257590294, 0.005569287110120058, -0.028089171275496483, 0.04142652079463005, -0.03232670575380325, 0.025872791185975075, -0.07439207285642624, 0.04975134879350662, 0.049770113080739975, -0.05090470612049103, -0.04476647078990936, 0.09217675030231476, 0.05079415813088417, 0.017867455258965492, -0.04477125406265259, 0.004301204811781645, 0.05066722631454468, -0.08186711370944977, 0.008772231638431549, -0.10532139241695404, 0.004499110858887434, 0.03296274691820145, -0.0020684772171080112, 0.05012065917253494],
- "Metadata":[-0.07879140228033066, 0.024690961465239525, 0.022790303453803062, 0.01354144886136055, -0.07098772376775742, 0.04053819552063942, -0.04038544371724129, -0.021055836230516434, 0.10361373424530029, 0.04415135458111763, -0.09545262902975082, 0.042553599923849106, -0.021835647523403168, 0.07703430950641632, -0.04880501329898834, -0.04054124280810356, 0.05049756169319153, 0.08986796438694, 0.0705084353685379, -0.0077315340749919415, -0.045390889048576355, 0.053155045956373215, 0.045656319707632065, -0.02663712576031685, -0.01446426473557949, -0.058978915214538574, 0.011314704082906246, 0.03043927252292633, -0.0843580812215805, 0.017854437232017517, -0.08720997720956802, 0.030351335182785988, -0.04896129295229912, 0.04189978539943695, -0.09887325763702393, 0.0015409664483740926, -0.08604399859905243, 0.10654544085264206, 0.1058540865778923, 0.014106648042798042, 0.0640459656715393, -0.05182884633541107, 0.006081609521061182, 0.07624028623104095, 0.02025698497891426, 0.08467324078083038, 0.027136018499732018, 0.026320911943912506, -0.035337720066308975, 0.03864980861544609, -0.019960917532444, -0.029152821749448776, 0.06562864780426025, 0.028298277407884598, -0.07397148013114929, -0.005078969523310661, 0.025909438729286194, -0.01157586183398962, 0.05436081811785698, 0.03408071771264076, -0.07142144441604614, -0.0523630827665329, -0.06302442401647568, -0.019975490868091583, -0.06937523931264877, 0.057667043060064316, -0.08580337464809418, -0.05092239752411842, -0.012613813392817974, 0.025480754673480988, 0.04219530522823334, -0.007300581783056259, 0.05323299020528793, 0.0489904023706913, 0.09260626882314682, -0.04819458723068237, 0.05419271066784859, 0.04558999091386795, 0.012036344967782497, -0.05483977124094963, -0.05181310698390007, -0.02104383148252964, -0.057876624166965485, 0.039601441472768784, 0.025240536779165268, -0.03984035924077034, 0.07654847204685211, -0.07073183357715607, -0.0018080074805766344, -0.016453349962830544, 0.03962434455752373, 0.05717255175113678, 0.01962372660636902, 0.00952839944511652, 0.0013127806596457958, 0.013634574599564075, 0.07692103832960129, 0.06334574520587921, 0.056647684425115585, -0.02965259924530983],
"NEG":[-0.0585959330201149, -0.02519698068499565, 0.029133861884474754, -0.003332944354042411, 0.05054186284542084, -0.03572014719247818, -0.012210451066493988, 0.06708117574453354, -0.0712793841958046, -0.01644597202539444, 0.06453811377286911, -0.03662518784403801, 0.0545802004635334, -0.11130833625793457, -0.04544609412550926, 0.012950814329087734, 0.08011337369680405, 0.014672964811325073, 0.0030391360633075237, -0.10994786024093628, 0.004102041013538837, -0.0749390497803688, -0.010000540874898434, 0.062072113156318665, 0.03312767669558525, -0.04764379560947418, -0.033307697623968124, 0.02903047949075699, 0.0319744311273098, 0.027374137192964554, -0.05640692263841629, -0.01572772115468979, 0.019634589552879333, 0.0629790723323822, -0.024743184447288513, -0.09348101913928986, 0.04078087955713272, 0.0002063393039861694, 0.01791796088218689, -0.01174850668758154, 0.0067609078250825405, 0.031922854483127594, 0.045338794589042664, -0.06706424057483673, -0.03090975433588028, 0.035511564463377, 0.0377444289624691, -0.007464382331818342, 0.02387971244752407, -0.023001981899142265, -0.0052301278337836266, 0.08532170951366425, 0.00384823651984334, 0.0689602717757225, -0.05606595426797867, 0.03483026847243309, 0.023350417613983154, -0.06512849777936935, 0.0627395287156105, -0.0203714482486248, -0.009735504165291786, 0.06432165950536728, -0.04546240717172623, 0.0322086475789547, 0.004561635199934244, 0.040702879428863525, -0.0680280476808548, 0.025354159995913506, -0.07624178379774094, 0.06776861846446991, 0.07863514125347137, -0.037652503699064255, -0.023264721035957336, 0.030604641884565353, -0.07419195026159286, 0.014679630286991596, 0.1294829547405243, 0.007591600529849529, -0.06612348556518555, 0.03127516806125641, 0.10645392537117004, -0.018773522228002548, 0.03992835432291031, 0.044048961251974106, 0.00023814172891434282, -0.06797933578491211, -0.08000202476978302, -0.04320430010557175, 0.043590281158685684, -0.05034546181559563, 0.014501169323921204, 0.03329288214445114, 0.03045976720750332, -0.01932660862803459, -0.026188183575868607, -0.1232738122344017, -0.04858024790883064, -0.015570580027997494, 0.013346930965781212, 0.009410912171006203],
"NOT":[0.02556992694735527, -0.0005189123330637813, 0.010195978917181492, -0.027382172644138336, -0.0374554842710495, 0.08793098479509354, 0.0024311996530741453, -0.08769379556179047, -0.054654307663440704, -0.08747632801532745, 0.09218847006559372, 0.0972878560423851, 0.044738128781318665, -0.02398994378745556, -0.046165600419044495, -0.0002692296984605491, -0.03797682002186775, 0.05161413550376892, -0.033769138157367706, 0.011279402300715446, 0.08941229432821274, -0.07437314093112946, -0.025249861180782318, 0.1026485413312912, -0.042062994092702866, 0.022835882380604744, 0.05108749121427536, -0.054616689682006836, -0.04208545386791229, 0.10205414891242981, -0.02474227361381054, -0.01605238951742649, -0.011079655028879642, -0.04231556877493858, -0.058844879269599915, 0.0017704797210171819, 0.005396600812673569, -0.058835554867982864, 0.03384264558553696, -0.024245088919997215, 0.03355555981397629, 0.02017929218709469, 0.04421762749552727, 0.09027500450611115, 0.03916880115866661, 0.042518291622400284, 0.024490609765052795, 0.00026937652728520334, -0.010342003777623177, -0.05488119646906853, 0.07418034970760345, 0.0008032438345253468, 0.09190968424081802, 0.07747997343540192, -0.024773627519607544, 0.0496656633913517, -0.038326963782310486, -0.0022213482297956944, 0.02448110282421112, 0.0022990668658167124, 0.052763812243938446, 0.051123637706041336, 0.03795074671506882, 0.06734737008810043, -0.030445149168372154, 0.021410485729575157, -0.044919464737176895, -0.0011586989276111126, -0.0903671532869339, -0.01408425159752369, 0.07342954725027084, -0.04118982329964638, -0.008432484231889248, -0.0008165669860318303, -0.0642886608839035, 0.007230957038700581, -0.0670868456363678, -0.01116579957306385, -0.09545603394508362, -0.03109285980463028, 0.005951744969934225, 0.024672016501426697, -0.04027184471487999, 0.03607063740491867, 0.023179687559604645, 0.0117312828078866, -0.019768331199884415, -0.023262612521648407, 0.04165903106331825, -0.039224691689014435, 0.040571704506874084, 0.08653629571199417, 0.027772698551416397, -0.08196783810853958, -0.013821743428707123, 0.004212009254842997, 0.01664070598781109, -0.008459849283099174, 0.041462354362010956, 0.06886350363492966],
"OR":[-0.0010318798013031483, -0.058885037899017334, 0.015562368556857109, -0.03459857404232025, -0.006239954382181168, 0.04347813501954079, -0.043183062225580215, -0.06115246191620827, -0.08097145706415176, -0.040188197046518326, 0.02098822593688965, -0.013338722288608551, -0.01845080405473709, -0.07172099500894547, -0.00026761949993669987, 0.015059647150337696, -0.08275016397237778, 0.10280061513185501, -0.017712965607643127, -0.07511771470308304, 0.007648291997611523, -0.12827979028224945, -0.020353827625513077, 0.08809063583612442, -0.02829514630138874, 0.003038457129150629, -0.04399721696972847, 0.046383049339056015, 0.06416497379541397, -0.0006932668038643897, -0.033501505851745605, -0.012374987825751305, 0.018504725769162178, 0.00529597420245409, -0.040804456919431686, -0.00419827364385128, -0.017476536333560944, -0.04530858248472214, 0.01608600653707981, -0.08898036181926727, -0.015132613480091095, -0.053797122091054916, -0.011825251393020153, 0.09507828205823898, 0.08454664051532745, 0.04075947031378746, 0.020354142412543297, 0.01704799383878708, -0.026439497247338295, -0.04004717990756035, -0.053405825048685074, 0.04079057276248932, 0.026150185614824295, 0.04538597911596298, 0.046778932213783264, 0.057205770164728165, 0.037173718214035034, -0.07114585489034653, 0.03480122983455658, 0.0069038826040923595, -0.056386105716228485, -0.03294815868139267, 0.04636325314640999, -0.05767818167805672, -0.05788124352693558, -0.011048000305891037, -0.04350278526544571, 0.029680529609322548, -0.0512658953666687, 0.04321866109967232, 0.047014784067869186, -0.014913392253220081, -0.007425297982990742, -0.09810416400432587, -0.07316632568836212, 0.05063875392079353, -0.07298189401626587, -0.012434680946171284, -0.09386061877012253, 0.016765601933002472, 0.06658460199832916, 0.0014198448043316603, -0.022241152822971344, 0.05902376398444176, 0.057584285736083984, 0.024565961211919785, -0.02896188013255596, 0.006485136691480875, 0.05981580168008804, -0.015995489433407784, 0.027470067143440247, 0.09679803997278214, 0.0342426523566246, -0.08387557417154312, -0.015599220991134644, -0.0049544889479875565, -0.06524655222892761, 0.02150602824985981, 0.016511479392647743, 0.055177561938762665],
@@ -115,7 +106,6 @@
"PCMPGTBrr":[-0.04665364325046539, -0.03588206693530083, 0.05219453573226929, 0.08376432955265045, 0.05562759190797806, -0.0034289404284209013, 0.08200010657310486, 0.023898538202047348, -0.002851601457223296, -0.08778133243322372, 0.017107484862208366, 0.08448091894388199, 0.020043527707457542, 0.038858626037836075, 0.036468397825956345, -0.0069902255199849606, -0.09442859143018723, 0.0018075992120429873, 0.05577728524804115, -0.0005804274696856737, 0.029588190838694572, -0.050955869257450104, 0.016604335978627205, -0.054141607135534286, -0.030936168506741524, 0.004688458051532507, -0.02321118488907814, -0.009524177759885788, 0.030161075294017792, -0.0557246096432209, 0.017830688506364822, 0.04058525711297989, 0.023080267012119293, 0.04536818340420723, 0.09658516198396683, 0.004083207808434963, 0.053284309804439545, 0.07114734500646591, 0.03272407501935959, -0.06646303087472916, 0.08200454711914062, -0.06558514386415482, 0.0745493471622467, -0.0010506648104637861, -0.02250707894563675, 0.015057512558996677, -5.047186277806759e-06, 0.04663649946451187, 0.06489380449056625, -0.0477377213537693, -0.08882559835910797, 0.08948437124490738, -0.052260447293519974, 0.06798093020915985, -0.06404604762792587, 0.0005905702710151672, 0.014312930405139923, 0.0370929092168808, 0.03622571751475334, 0.06601805984973907, 0.04077596217393875, -0.0019877473823726177, -0.02357509359717369, 0.04524341970682144, 0.024309739470481873, -0.05969798564910889, -0.015872884541749954, -0.055400021374225616, 0.04820183292031288, 0.024034500122070312, -0.05125486105680466, 0.020366262644529343, 0.03310052305459976, 0.1036759540438652, 0.049202825874090195, -0.010945710353553295, -0.030628688633441925, 0.048871662467718124, 0.07457619905471802, 0.017111260443925858, 0.028184816241264343, -0.09065181016921997, -0.017116032540798187, -0.06233282387256622, -0.011385255493223667, -0.06190027296543121, -0.01189250499010086, -0.03632708638906479, 0.04705822467803955, 0.0022293981164693832, 0.06782552599906921, -0.0490303635597229, -0.08690774440765381, -0.08311695605516434, 0.04079030826687813, 0.022971853613853455, -0.019726071506738663, -0.032829709351062775, -0.05147984251379967, -0.06768873333930969],
"PCMPGTDrr":[0.026095403358340263, 0.009877854026854038, -0.022390423342585564, -0.06749505549669266, 0.03866114094853401, 0.07523459941148758, -0.02331429533660412, -0.013958744704723358, -0.05151516944169998, -0.033018071204423904, -0.017118683084845543, 0.06611985713243484, 0.024562569335103035, 0.027193237096071243, -0.04081164300441742, -0.0557839497923851, 0.07676059752702713, -0.017435213550925255, -0.0696197971701622, 0.04529204219579697, 0.015718640759587288, -0.0868423655629158, -0.025476763024926186, 0.1075882539153099, 0.08407340198755264, 0.03219793736934662, -0.029079284518957138, -0.10067792236804962, -0.01665782555937767, -0.002518820110708475, 0.06302576512098312, -0.042360853403806686, -0.014688530936837196, -0.04797102138400078, -0.05708448588848114, 0.05345156416296959, -0.03360274061560631, -0.006362707354128361, 0.045909661799669266, -0.0034944594372063875, -0.04771789163351059, -0.015326191671192646, -0.017800530418753624, 0.009678518399596214, -0.01412744726985693, 0.09620117396116257, 0.0705861821770668, -0.0663042888045311, 0.07589521259069443, -0.08846025168895721, 0.008178732357919216, -0.023293234407901764, 0.049390021711587906, 0.00771696399897337, -0.026583032682538033, 0.012981866486370564, -0.06098538264632225, -0.04784953594207764, -0.001411060569807887, -0.0646580159664154, 0.07771933078765869, 0.012061100453138351, 0.026251494884490967, 0.024035189300775528, 0.00368816708214581, 0.019370727241039276, 0.0473535880446434, 0.0688827782869339, -0.0656280517578125, 0.0001225982268806547, -0.04765431582927704, 0.08570858836174011, 0.06544618308544159, 0.02309294231235981, -0.07891835272312164, 0.05969972908496857, 0.04259306937456131, -0.0388357900083065, 0.10700955986976624, -0.03643207252025604, -0.014097973704338074, 0.018475063145160675, -0.008959461003541946, -0.04132810980081558, -0.01586003415286541, -0.013873838819563389, 0.07354859262704849, 0.003967848140746355, -0.023853322491049767, -0.013099947944283485, 0.06407736241817474, 0.03060499019920826, -0.08859552443027496, 0.009045977145433426, -0.09939071536064148, -0.022137949243187904, -0.03951180726289749, -0.0316530205309391, 0.05501912534236908, -0.06330689787864685],
"PEXTRWrr":[-0.05698293820023537, -0.02332535944879055, -0.01313185878098011, 0.08844685554504395, -0.030702419579029083, -0.042257267981767654, -0.06976033002138138, 0.08907881379127502, 0.040486857295036316, 0.01966431364417076, 0.011261478997766972, 0.011022844351828098, -0.0069642444141209126, -0.016230706125497818, -0.009695738554000854, -0.04666578397154808, 0.016855308786034584, -0.03308985382318497, 0.01504850760102272, 0.09940154105424881, -0.07109691947698593, 0.043378811329603195, -0.06964893639087677, -0.05999808758497238, 0.008651218377053738, 0.04237857088446617, 0.04557272046804428, 0.04033806174993515, -0.005760873202234507, 0.008976156823337078, 0.05276636406779289, -0.06584233790636063, -0.011512805707752705, -0.01598522625863552, -0.044132646173238754, -0.020889364182949066, -0.09435509145259857, -0.02823605202138424, 0.0820322185754776, -0.0391690619289875, 0.03367430716753006, -0.029474111273884773, -0.07719384133815765, 0.003098628716543317, 0.05822441354393959, -0.09175454080104828, 0.02256210707128048, -0.004901964217424393, -0.008566503413021564, 0.040359016507864, -0.04049991816282272, 0.010366388596594334, -0.05293237417936325, -0.0956558957695961, -0.01418458204716444, 0.05464276298880577, -0.014091472141444683, 0.023551519960165024, -0.042662639170885086, -0.07025191932916641, -0.0017952515045180917, 0.07680258899927139, -0.10743812471628189, -0.08435508608818054, -0.00337960640899837, -0.03381747379899025, 0.027066459879279137, -0.009784750640392303, 0.04265652969479561, 0.02066781371831894, -0.03692338988184929, 0.0029027678538113832, 0.06893923878669739, 0.03784753382205963, -0.04037536308169365, -0.09532847255468369, 0.03193795308470726, 0.0387917198240757, 0.03887058049440384, -0.0002478501701261848, -0.0671166405081749, 0.06754262745380402, 0.01643708348274231, 0.012460017576813698, 0.03147564455866814, 0.05646798014640808, 0.014081758446991444, 0.07141963392496109, 0.016428180038928986, 0.0443485863506794, 0.06492826342582703, 0.09964785724878311, 0.026795320212841034, 0.0271765124052763, -0.015695465728640556, -0.08133535832166672, -0.05439477041363716, 0.04913243651390076, 0.024485180154442787, -0.04072758927941322],
- "PHY_REG":[-0.008169060572981834, -0.017023155465722084, -0.04927198588848114, 0.0014261528849601746, 0.012259463779628277, -0.02794509381055832, -0.024857040494680405, 0.029203711077570915, 0.0433109886944294, 0.009679347276687622, -0.05811547115445137, -0.09075025469064713, -0.08525611460208893, -0.10545054078102112, 0.06474080681800842, 0.056396666914224625, 0.06781823933124542, 0.09059076011180878, -0.10420752316713333, -0.08284831047058105, 0.02349182404577732, -0.0354253351688385, -0.004627702757716179, 0.0068538435734808445, -0.053724177181720734, -0.02113335393369198, 0.05254676192998886, -0.050769440829753876, 0.061386119574308395, -0.07541731745004654, -0.024204161018133163, -0.0009893826209008694, -0.007493770215660334, -0.017051052302122116, 0.015025814063847065, -0.020427946001291275, -0.0844966471195221, 0.04589429497718811, 0.025571472942829132, -0.05280151963233948, 0.06895384937524796, 0.03960262984037399, 0.0068003153428435326, 0.09397424012422562, -0.0523529127240181, 0.03780638054013252, -0.015423302538692951, 0.029167350381612778, 0.01019437238574028, 0.023989612236618996, -0.03344425559043884, -0.07926471531391144, -0.09238854795694351, 0.04794330149888992, 0.01872367039322853, -0.029179377481341362, -0.05339968949556351, -0.04575541242957115, -0.004491546656936407, -0.009650425054132938, 0.026945313438773155, -0.02115861512720585, 0.06488905847072601, -0.06647083908319473, 0.008904196321964264, 0.010536684654653072, -0.06012551859021187, -0.00022655133216176182, -0.10175421833992004, 0.062001921236515045, -0.054452817887067795, 0.01785552129149437, -0.06749527156352997, -0.04883178323507309, -0.023449009284377098, 0.040745027363300323, 0.002448269398882985, 0.07842953503131866, -0.019806355237960815, -0.08275315910577774, 0.01131721492856741, 0.0482926219701767, 0.01892486959695816, 0.005685009527951479, -0.0055344682186841965, -0.0034555341117084026, -0.07923021167516708, 0.06387833505868912, 0.05978211387991905, -0.001252106623724103, 0.07216084003448486, -0.01223798282444477, 0.09716741740703583, 0.009659498929977417, -0.09404221922159195, -0.10122949630022049, -0.003581057768315077, 0.07885389029979706, 0.05305042862892151, -0.04988719895482063],
"PMOVMSKBrr":[0.07294902205467224, -0.00040799094131216407, -0.01483855675905943, -0.02571418508887291, 0.08466307818889618, -0.03447218984365463, -0.05685977265238762, -0.019133185967803, 0.06332023441791534, -0.061352625489234924, -0.023195402696728706, -0.05378473922610283, -0.05650350823998451, 0.06583224982023239, -0.012845925986766815, -0.052972156554460526, 0.049470845609903336, -0.04565730318427086, 0.09717552363872528, -0.014171762391924858, 0.013508875854313374, 0.004057068843394518, -0.020556267350912094, -0.10475417971611023, 0.018426941707730293, -0.07273723930120468, 0.01702595315873623, -0.013097747229039669, -0.07530277967453003, 0.05442536994814873, -0.0601920410990715, -0.05255919322371483, -0.07305102050304413, 0.02758030779659748, 0.06180129200220108, 0.10606050491333008, 0.046477098017930984, -0.024062691256403923, 0.07360008358955383, -0.011283098720014095, -0.03712400794029236, -0.09973011910915375, 0.018314119428396225, 0.009135990403592587, -0.01891133189201355, 0.00915572326630354, 0.006080301944166422, -0.02368554100394249, -0.019582828506827354, 0.051494162529706955, -0.010953089222311974, 0.011621126905083656, 0.010515356436371803, 0.011188569478690624, -0.0202876515686512, 0.038686931133270264, -0.066365085542202, 0.014182188548147678, 0.00445093447342515, 0.05712618678808212, -0.04463819041848183, -0.10292281210422516, -0.011173201724886894, 0.0029098563827574253, 0.06890314072370529, 0.06398330628871918, 0.03248615562915802, -0.05457807704806328, -0.006898659747093916, 0.038892313838005066, -0.09130232781171799, 0.013324378058314323, -0.033766016364097595, -0.043404608964920044, 0.018701359629631042, -0.03784232959151268, -0.05014420300722122, 0.04404780641198158, 0.09254389256238937, 0.09839074313640594, -0.028214668855071068, 0.03262662887573242, 0.04281335324048996, 0.07356158643960953, -0.0773080587387085, 0.026536725461483, -0.06819723546504974, 0.03335537016391754, 0.09355103969573975, -0.052649617195129395, -0.08467497676610947, -0.06516479700803757, -0.07499512284994125, 0.023276200518012047, -0.06063856557011604, -0.044472258538007736, 0.03155883774161339, -0.011262890882790089, 0.04045895114541054, 0.012343645095825195],
"PMULUDQrr":[-0.018331514671444893, 0.04249238595366478, 0.0718526765704155, 0.03221653401851654, -0.04829120263457298, -0.02055567130446434, 0.05200991779565811, -0.04337913170456886, -0.02698952704668045, 0.05037892237305641, 0.014545431360602379, 0.09035851061344147, 0.0777752548456192, -0.06762461364269257, 0.032133519649505615, 0.048851024359464645, 0.01295433659106493, 0.054136257618665695, 0.09599477052688599, 0.024489495903253555, 0.05683024227619171, -0.05242127552628517, -0.043476004153490067, 0.004586773458868265, 0.024281315505504608, 0.03402777388691902, 0.0033939755521714687, 0.049474406987428665, 0.0011405921541154385, 0.06828528642654419, 0.08426304161548615, -0.029339993372559547, -0.04173621907830238, -0.03966334089636803, -0.03011258877813816, -0.07684683799743652, 0.040944185107946396, -0.04709877818822861, 0.07968004047870636, 0.07534269988536835, -0.006957313045859337, -0.0016522067598998547, -0.017229178920388222, 0.030470186844468117, 0.05390452966094017, 0.05233803763985634, 0.045554302632808685, -0.03710555285215378, 0.05699322372674942, 0.019888387992978096, 0.10152119398117065, 0.026563912630081177, -0.0018862299621105194, -0.02453959546983242, -0.06107368320226669, -0.04910692200064659, -0.06316373497247696, 0.04648333042860031, -0.00939352996647358, 0.030374331399798393, 0.0027768383733928204, 0.07302171736955643, -0.0035402378998696804, 0.054474033415317535, -0.0739617869257927, 0.01190911140292883, -0.019428657367825508, -0.006644500885158777, -0.04998863860964775, 0.03215506672859192, 0.054085105657577515, 0.047874726355075836, 0.10735851526260376, 0.030255280435085297, 0.029996531084179878, 0.006218941882252693, 0.04892734810709953, 0.06425125896930695, -0.017792150378227234, 0.041398752480745316, -0.017293022945523262, -0.011015499010682106, -0.02933122031390667, -0.005825115367770195, -0.07212502509355545, 0.10469445586204529, 0.009840304031968117, 0.026172513142228127, 0.002459621522575617, -0.02771947532892227, -0.006639100145548582, -0.04062161594629288, -0.0746249407529831, 0.04523816704750061, -0.07439430058002472, 0.06977812945842743, 0.008738852106034756, 0.06937781721353531, 0.07391723990440369, -0.09542208909988403],
"POPCNT":[0.032459065318107605, 0.11127372831106186, -0.004006756469607353, 0.06373029947280884, 0.07161973416805267, -0.07966824620962143, -0.014274416491389275, 0.02168503776192665, -0.060636017471551895, -0.051414258778095245, 0.003268218832090497, 0.05552225932478905, 0.01940925046801567, -0.05398592725396156, 0.09021458029747009, -0.060922130942344666, -0.0407782681286335, -0.027882883325219154, 0.012706448324024677, -0.02730434015393257, 0.05854162946343422, -0.0798129290342331, -0.00179530237801373, 0.04958317428827286, -0.04621487483382225, 0.0524308979511261, -0.03889109939336777, 0.07240460813045502, 0.06366933137178421, 0.029314585030078888, -0.014743340201675892, -0.021233027800917625, 0.06803205609321594, -0.01269250176846981, -0.033408213406801224, 0.09638478606939316, -0.02009841799736023, -0.014619074761867523, 0.022498659789562225, 0.006679723970592022, -0.016163295134902, 0.09717728197574615, -0.010882971808314323, -0.09489153325557709, 0.046623144298791885, -0.04596618935465813, -0.026864662766456604, 0.01605546846985817, 0.05979238823056221, -0.024411896243691444, 0.039511535316705704, -0.0108433086425066, -0.05629622936248779, 0.02339898608624935, -0.025785285979509354, 0.011886742897331715, 0.08834438771009445, -0.08506806194782257, 0.021776534616947174, 0.01446699257940054, -0.009117010980844498, -0.022380229085683823, -0.0541100800037384, -0.040569182485342026, -0.02888612262904644, 0.07774273306131363, -0.052350424230098724, -0.039240963757038116, 0.004771160893142223, 0.014987779781222343, -0.05511622130870819, 0.019763313233852386, -0.0920683741569519, 0.021821241825819016, 0.10812623798847198, -0.06422155350446701, -0.07388156652450562, 0.00949418731033802, -0.06905169039964676, 0.006180475000292063, -0.02844754233956337, 0.11084792017936707, -0.03348945826292038, 0.06860767304897308, -0.0214154664427042, -0.0008655296987853944, -0.020698973909020424, 0.03369581326842308, 0.019848104566335678, 0.013533092103898525, 0.03423681482672691, 0.014547858387231827, 0.02418140508234501, -0.013769546523690224, -0.09633788466453552, 0.01689709909260273, -0.01452709175646305, 0.047873757779598236, -0.0012036423431709409, 0.03720762953162193],
@@ -145,7 +135,6 @@
"RET":[-0.09685279428958893, 0.0101965656504035, -0.04206235706806183, -0.05282443389296532, 0.050776951014995575, -0.006812752690166235, 0.09618920832872391, 0.04637071117758751, -0.018928129225969315, -0.04118828848004341, -0.06039129197597504, -0.018619466572999954, -0.07845143973827362, -0.14034120738506317, -0.03397035226225853, -0.028233898803591728, -0.08162513375282288, 0.048710327595472336, -0.04177732393145561, -0.08455172181129456, 0.00312337395735085, 0.03531079366803169, -0.057201240211725235, 0.09391707926988602, -0.02847883477807045, 0.01840023323893547, -0.04936904460191727, 0.027487540617585182, 0.08041024953126907, -0.08714525401592255, 0.11963017284870148, -0.0762581005692482, -0.06482874602079391, 0.038007382303476334, -0.003661463735625148, 0.0064629544503986835, -0.08281382918357849, -0.053177930414676666, 0.01966426707804203, -0.04822755232453346, -0.0474051795899868, 0.026990806683897972, -0.057971399277448654, 0.12347304075956345, -0.02745792828500271, 0.0832793116569519, 0.03029884397983551, -0.032751865684986115, -0.022912420332431793, -0.030569355934858322, 0.0971289873123169, 0.07298070192337036, 0.0306894201785326, 0.05817654728889465, 0.005174126010388136, 0.042281877249479294, 0.01975836046040058, -0.11205509305000305, 0.05081645026803017, 0.0034761943388730288, -0.03858469799160957, 0.007316718343645334, 0.07441510260105133, 0.004579664673656225, -0.021868426352739334, 0.01116174180060625, 0.061042461544275284, 0.029598504304885864, -0.06691239774227142, 0.03223221376538277, 0.0867755264043808, 0.05488765984773636, -0.019738517701625824, -0.030367519706487656, -0.06396497040987015, -0.0022451707627624273, -0.06131305173039436, -0.03129804506897926, -0.05657076835632324, 0.009733426384627819, -0.08145039528608322, -0.09049411863088608, 0.004821183159947395, 0.038612931966781616, -0.019062234088778496, -0.021097682416439056, -0.06061801686882973, 0.019766775891184807, 0.0276743546128273, -0.057942990213632584, -0.033430278301239014, 0.0043391571380198, 0.05848158895969391, 0.0826464518904686, 0.09988056123256683, -0.05677378550171852, -0.11326800286769867, 0.051275406032800674, 0.01158174965530634, 0.04368240013718605],
"ROL":[0.026423713192343712, 0.08523924648761749, 0.005345864687114954, 0.027778491377830505, 0.06572498381137848, 0.056946828961372375, -0.03009108640253544, -0.05564097315073013, 0.07753216475248337, -0.07402804493904114, -0.05589171126484871, -0.050976503640413284, 0.041095346212387085, -0.06708681583404541, 0.08517566323280334, 0.02110634744167328, -0.027871981263160706, 0.0005450723110698164, 0.07511565834283829, 0.0016275837551802397, 0.04902505874633789, 0.024746844545006752, 0.08780711144208908, -0.06167766824364662, 0.06365402787923813, 0.06462119519710541, -0.04920244216918945, 0.056112516671419144, 0.10561680048704147, 0.07879003882408142, 0.03879575803875923, -0.03582729026675224, 0.004805437754839659, 0.030719229951500893, 0.0558336041867733, 0.04387545958161354, 0.020841658115386963, 0.015068157576024532, -0.008266274817287922, 0.05914990231394768, -0.01581275276839733, 0.060716625303030014, -0.02257946878671646, 0.00995479617267847, 0.002104438142850995, 0.03806104138493538, 0.010437156073749065, 0.039603881537914276, -0.02074524573981762, 0.024094516411423683, 0.031944990158081055, -0.07122939079999924, -0.023190783336758614, 0.006518832873553038, -0.04528677463531494, -0.02354210615158081, -0.03518632799386978, -0.07059651613235474, -0.017474880442023277, 0.06688393652439117, -0.07900173962116241, -0.05843310430645943, 0.05351021885871887, -0.05724814161658287, -0.02697751857340336, -0.031128596514463425, 0.03040527179837227, -0.009157841093838215, -0.07642515003681183, -0.042137425392866135, -0.031383614987134933, -0.07586777210235596, 0.0489036925137043, 0.0657171905040741, 0.027123138308525085, 0.034842655062675476, -0.035231154412031174, 0.009778738021850586, -0.06150955334305763, 0.042132262140512466, 0.08945925533771515, -0.07213590294122696, -0.0518047958612442, -0.07094760239124298, 0.07041053473949432, 0.1046413779258728, 0.02394813485443592, -0.014966128394007683, -0.04967860132455826, -0.03941388055682182, -0.10642798990011215, -0.03915626183152199, -0.10921923071146011, -0.035421375185251236, 0.039855729788541794, 0.04145469889044762, -0.025123557075858116, 0.06743432581424713, -0.02060243859887123, 0.02994687482714653],
"ROR":[-0.03797177970409393, -0.03406170755624771, -0.014866529032588005, -0.002243943279609084, 0.024476991966366768, -0.08789698034524918, 0.02924288995563984, -0.03145875036716461, -0.0030907171312719584, 0.013303312472999096, 0.05823688209056854, 0.06085257977247238, 0.0682583823800087, 0.06680850684642792, -0.0008473473135381937, 0.056926507502794266, 0.05309343710541725, 0.017690004780888557, -0.028605103492736816, 0.02303914539515972, -0.07054196298122406, -0.011117611080408096, -0.0012138717574998736, -0.0877937376499176, -0.005339651368558407, -0.029197875410318375, -0.06283852458000183, 0.00677055399864912, 0.07529082894325256, -0.005144342314451933, -0.03930655121803284, -0.0469868965446949, 0.06799482554197311, -0.013870766386389732, -0.07353825122117996, -0.10425472259521484, 8.023920236155391e-05, 0.05196760594844818, -0.024758316576480865, -0.03249195218086243, -0.0037688545417040586, -0.0033505349420011044, 0.04382188990712166, 0.035679250955581665, -0.04743441194295883, 0.031142324209213257, -0.04255860671401024, -0.02310662344098091, -0.04199622571468353, -0.034439221024513245, -0.06397263705730438, -0.011049525812268257, -0.055776823312044144, 0.039233505725860596, 0.016644736751914024, -0.08737850934267044, 0.0151174021884799, 0.10728199779987335, -0.0006503594922833145, -0.060365013778209686, -0.05337308719754219, -0.021152105182409286, 0.06532585620880127, -0.00926337018609047, -0.08149554580450058, 0.0485830195248127, 0.034749776124954224, -0.05045035108923912, -0.06366241723299026, 0.0544571727514267, 0.07594002038240433, 0.027496861293911934, -0.047294747084379196, 0.017491186037659645, -0.034639474004507065, 0.006060798652470112, -0.07335491478443146, -0.054728057235479355, -0.0018357941880822182, -0.07110298424959183, 0.09072742611169815, 0.03083305060863495, 0.054598040878772736, -0.028097454458475113, -0.012821618467569351, 0.008708478882908821, -0.06561881303787231, -0.04448843002319336, 0.08860815316438675, -0.050312310457229614, 0.09012935310602188, -0.004711236339062452, -0.020932462066411972, -0.10615857690572739, -0.005630030296742916, 0.03976801037788391, 0.040199730545282364, 0.07235082983970642, -7.448523683706298e-05, 0.076942078769207],
- "RegisterMask":[0.009287647902965546, 0.029691029340028763, -0.03465871885418892, 0.032606374472379684, -0.007339544594287872, 0.03367740660905838, -0.0661492720246315, 0.0436118021607399, -0.002896533813327551, 0.028440887108445168, -0.06791415065526962, 0.004055356606841087, -0.01596181094646454, -0.003846745239570737, 0.06762582808732986, -0.025632556527853012, 0.08132420480251312, 0.025554664433002472, -0.08994632959365845, 0.02521730400621891, 0.023826507851481438, 0.0004487193073146045, 0.01047397032380104, 0.03246957063674927, -0.033482909202575684, 0.05051224306225777, 0.005778896156698465, -0.0006257061613723636, 0.00522293895483017, -0.04666636884212494, 0.022335125133395195, -0.022150320932269096, 0.04510439187288284, -0.02769547514617443, 0.026804683730006218, 0.0710473507642746, -0.014513042755424976, 0.0695318952202797, 0.048469461500644684, -0.008654370903968811, -0.028613079339265823, -0.02918054349720478, -0.022721733897924423, -0.0004791628452949226, 0.011470172554254532, 0.08561886101961136, 0.07125027477741241, -0.05847848951816559, 0.011811288073658943, -0.025244031101465225, -0.03665035218000412, -0.03482883796095848, 0.04196881502866745, 0.06909161061048508, 0.02365143597126007, -0.0689089447259903, -0.0707414448261261, -0.03962424397468567, -0.025703679770231247, 0.06502455472946167, 0.057676125317811966, 0.026916807517409325, 0.024921152740716934, 0.009799988009035587, -0.018656229600310326, 0.009880480356514454, -0.06516153365373611, 0.019290866330266, 0.02236226759850979, -0.02598695270717144, -0.00299705658107996, 0.019448822364211082, -0.014883329160511494, 0.06645222008228302, -0.028751512989401817, -0.01589173451066017, 0.026225939393043518, 0.07285763323307037, -0.06037987396121025, -0.027615630999207497, -0.039930179715156555, -0.07122864574193954, 0.029825787991285324, 0.026364129036664963, -0.04438399150967598, 0.07015394419431686, -0.013950555585324764, 0.004367176443338394, 0.020521124824881554, 0.02030497044324875, 0.011951270513236523, 0.06765977293252945, -0.015042259357869625, 0.005189584568142891, -0.07532864063978195, -0.010886142030358315, 0.006792030762881041, -0.06348442286252975, 0.031859394162893295, -0.052482619881629944],
"SAR":[-0.058561697602272034, -0.014889497309923172, -0.009758144617080688, 0.00019282882567495108, -0.040600407868623734, -0.05907759070396423, 0.033052023500204086, -0.04672614857554436, -0.050173744559288025, -0.06619776040315628, 0.005385559983551502, 0.05449973791837692, -0.0035163976717740297, -0.12835650146007538, 0.06576846539974213, 0.030572880059480667, -0.014856431633234024, 0.011252024210989475, 0.018954169005155563, -0.10070347040891647, -0.032273050397634506, 0.007221086882054806, -0.020879192277789116, 0.0691007450222969, 0.01286559458822012, -0.020694725215435028, -0.07545264810323715, -0.07742343097925186, -0.005103116389364004, 0.10223732143640518, -0.08521754294633865, 0.07459715753793716, 0.006563629489392042, -0.059839747846126556, -0.023294325917959213, 0.04265525937080383, -0.011012998409569263, -0.02257128618657589, -0.033783379942178726, 0.0368407666683197, -0.048024341464042664, -0.037417128682136536, 0.09010431170463562, 0.09016482532024384, -0.07939734309911728, 0.03274676203727722, 0.0388714037835598, -0.03253694251179695, 0.020820122212171555, -0.0039061333518475294, 0.025425976142287254, -0.01847209222614765, 0.013026821427047253, 0.08873090147972107, -0.010358930565416813, -0.026935681700706482, 0.04795868322253227, -0.06173045188188553, -0.02299962192773819, -0.09966729581356049, 0.008027775213122368, 0.03202224150300026, -0.08922284096479416, 0.03263246268033981, 0.0702379047870636, 0.08681228011846542, -0.053993936628103256, 0.0009890834335237741, -0.060423459857702255, 0.08636976033449173, 0.04784319922327995, 0.05135124549269676, -0.023515762761235237, 0.015414481982588768, -0.06941155344247818, 0.004289102740585804, -0.10909571498632431, 0.014149827882647514, -0.025285568088293076, 0.06270574778318405, 0.0669349953532219, 0.03599094599485397, 0.0436582937836647, 0.06281902641057968, -0.04479018226265907, -0.04126136004924774, -0.026938045397400856, -0.0349077507853508, 0.002964549232274294, -0.04247729107737541, 0.009402072057127953, 0.10574454814195633, 0.03262042999267578, 0.08030910044908524, -0.031244831159710884, 0.010621835477650166, -0.02628093585371971, 0.046942535787820816, -0.022998474538326263, 0.009223603643476963],
"SBB":[-0.040700677782297134, -0.01474229246377945, 0.09491399675607681, 0.015464535914361477, -0.05408482998609543, -0.09618491679430008, -0.014700816012918949, -0.06255258619785309, 0.09308589994907379, 0.01991264335811138, 0.04899228736758232, -0.03322140499949455, -0.03979090601205826, -0.161369189620018, 0.0957769826054573, -0.045866891741752625, -0.03776619955897331, 0.09559016674757004, -0.0063005415722727776, 0.07086999714374542, -0.004713557660579681, 0.10066409409046173, -0.053719762712717056, 0.07039386034011841, 0.01788068749010563, 0.01069885678589344, -0.003849055850878358, 0.07810717821121216, 0.10748977214097977, -0.09462521225214005, -0.06140149384737015, -0.028434589505195618, 0.0395897701382637, 0.05396975204348564, 0.009982907213270664, -0.014297235757112503, 0.018435295671224594, -0.04264533147215843, -0.0471954308450222, -0.008587008342146873, 0.010918513871729374, -0.03147284686565399, 0.08885594457387924, 0.05178891867399216, 0.05807363614439964, 0.028190992772579193, 0.04205470532178879, 0.00935433991253376, 0.027427801862359047, -0.02180725708603859, -0.06614664196968079, 0.021269382908940315, 0.0585390068590641, 0.12827278673648834, 0.0420454666018486, 0.06753493845462799, -0.05479112267494202, -0.06480395793914795, 0.02621031180024147, -0.07586188614368439, -0.04831313341856003, 0.016674980521202087, -0.006851759273558855, 0.04103298485279083, 0.005965645890682936, -0.02317493036389351, -0.03966135531663895, -0.02576862834393978, -0.0916895642876625, 0.029451601207256317, 0.044677067548036575, 0.026928072795271873, -0.10388721525669098, 0.021140936762094498, -0.06990157812833786, 0.048356350511312485, -0.08890967816114426, -0.0003503488842397928, -0.10245566070079803, -0.0582563653588295, 0.04677841439843178, 0.04697449132800102, -0.04022470489144325, 0.02759086713194847, -0.02867579087615013, 0.013355317525565624, 0.011504339054226875, -0.04230086877942085, -0.045500747859478, -0.03741880878806114, 0.022458063438534737, 0.05192841589450836, 0.008104681968688965, -0.08284809440374374, 0.059996478259563446, 0.07762005180120468, -0.0031316280364990234, 0.06990513950586319, -0.020328091457486153, -0.0027387691661715508],
"SETB_C":[-0.007473401725292206, -0.06315194815397263, 0.0693482831120491, 0.05207814276218414, -0.08006429672241211, -0.005448522046208382, -0.007457572966814041, 0.011581258848309517, -0.05411145091056824, -0.06738752871751785, 0.013233165256679058, -0.0677611380815506, 0.01846255734562874, -0.09321920573711395, -0.03116961196064949, 0.05861300230026245, -0.001519175828434527, 0.08354826271533966, 0.023905213922262192, 0.0124649154022336, 0.08983863890171051, 0.055941760540008545, 0.07229111343622208, 0.09052376449108124, -0.013718990609049797, 0.06642850488424301, 0.0822976604104042, 0.010060268454253674, 0.04116540774703026, -0.03301406651735306, 0.07296404242515564, -0.03534134477376938, 0.012426529079675674, -0.005412430968135595, 0.06087784096598625, 0.03547677770256996, -0.007232111878693104, 0.06580550968647003, -0.0037480974569916725, 0.02971699647605419, -0.06937503069639206, 0.08572175353765488, -0.02138090692460537, -0.0053040217608213425, -0.029469722881913185, -0.05332958698272705, 0.10073655843734741, -0.03199373558163643, -0.01775289885699749, -0.09716105461120605, 0.06483447551727295, 0.028643250465393066, -0.029914388433098793, 0.007070464547723532, 0.006640028208494186, -0.0033612342085689306, 0.005682659335434437, 0.011877131648361683, -0.038144148886203766, 0.03381858021020889, 0.02083616890013218, 0.029199717566370964, 0.07813020050525665, -0.006173993926495314, -0.016444502398371696, -0.08474857360124588, 0.03877300024032593, -0.046462398022413254, 0.02460806630551815, 0.053950369358062744, 0.01389766950160265, 0.03323421627283096, 0.04349416866898537, 0.04381947219371796, 0.10320119559764862, -0.1117740124464035, 0.03045269101858139, -0.03870442137122154, -0.07607249915599823, -0.00020808610133826733, -0.09519094228744507, 0.06727365404367447, -0.04469249024987221, 0.07144048810005188, -0.08811240643262863, 0.001203814405016601, 0.06901863217353821, 0.05462682247161865, -0.03902207687497139, -0.05885632708668709, -0.028275305405259132, -0.07151838392019272, -0.059166230261325836, -0.015570566058158875, 0.06314826756715775, -0.040293656289577484, 0.021595094352960587, -0.04083842411637306, -0.09180022031068802, -0.0309903621673584],
@@ -263,26 +252,6 @@
"VINSERTI":[0.09046315401792526, 0.015515458770096302, 0.04200809448957443, -0.046130646020174026, -0.045843131840229034, 0.003743539797142148, 0.025380345061421394, -0.021319231018424034, 0.03850293532013893, 0.006397924851626158, -0.06982530653476715, -0.016159888356924057, -0.09164588898420334, 0.1245846077799797, -0.05104857683181763, -0.011446121148765087, -0.06936608999967575, -0.02683587372303009, -0.0337526798248291, -0.005495472811162472, 0.023584537208080292, -0.0771733894944191, -0.026287894695997238, -0.01172191184014082, 0.09737047553062439, 0.0375351756811142, 0.03280220180749893, -0.014072075486183167, 0.06032971292734146, 0.0072259255684912205, -0.08368974179029465, 0.054626062512397766, -0.021156134083867073, -0.09647785127162933, 0.07431179285049438, -0.09039300680160522, -0.07652204483747482, -0.002478789770975709, -0.012967151589691639, 0.08174770325422287, -0.00968913547694683, 0.015551612712442875, 0.08655177801847458, -0.056927114725112915, -0.011370684020221233, -0.0408773347735405, 0.04413295164704323, 0.05919815972447395, 0.08101782202720642, -0.008914918638765812, -0.019233090803027153, 0.05211508646607399, -0.010292282328009605, 0.021839600056409836, 0.0016950241988524795, -0.031931016594171524, 0.004831018857657909, 0.015328328125178814, -0.015326892025768757, -0.05457184836268425, 0.03782501816749573, -0.014512602239847183, 0.00869232788681984, -0.04001179710030556, -0.00994281005114317, -0.041689563542604446, 0.060574647039175034, 0.044912341982126236, 0.05958174169063568, -0.035378437489271164, 0.08524063974618912, 0.012326095253229141, -0.052227456122636795, -0.015090923756361008, -0.012893415056169033, -0.019565775990486145, -0.03284028172492981, -0.02651887759566307, 0.02436136268079281, 0.004743371158838272, 0.019924448803067207, -0.046163417398929596, 0.005615816451609135, -0.03354670852422714, 0.00801338255405426, 0.02501787059009075, 0.03313247114419937, -0.012842012569308281, 0.04856807366013527, -0.031942710280418396, -0.026277944445610046, 0.11483185738325119, 0.015686793252825737, 0.052031729370355606, 0.025188622996211052, -0.021448785439133644, 0.05062439665198326, -0.030834710225462914, 0.02746596559882164, 0.027027780190110207],
"VINSERTPSrm":[0.010265239514410496, 0.03508870303630829, 0.0182888712733984, 0.01066108699887991, 0.09608251601457596, -0.0390457920730114, -0.02508910745382309, -0.051061324775218964, 0.051924970000982285, 0.02405509166419506, 0.07347054034471512, -0.023432396352291107, 0.03053455613553524, 0.11051331460475922, -0.07987380027770996, -0.07169938087463379, -0.06944284588098526, -0.010227087885141373, 0.01555782649666071, 0.0033066831529140472, -0.017572278156876564, 0.018880389630794525, -0.03347453102469444, 0.023936258628964424, 0.04189354181289673, -0.008910899050533772, 0.045309878885746, 0.039228133857250214, -0.0026367944665253162, 0.01713910512626171, -0.0038225038442760706, -0.02550170198082924, -0.04479162022471428, -0.11607012152671814, -0.05566885322332382, -0.03926549106836319, -0.05618799477815628, 0.0587196871638298, -0.003744689514860511, 0.09148088097572327, -0.008691483177244663, 0.060393813997507095, 0.05017181858420372, 0.05314680561423302, 0.010222898796200752, 0.04390108212828636, 0.06256565451622009, -0.039335936307907104, -0.030927496030926704, -0.010439696721732616, -0.09615844488143921, 0.04857023432850838, -0.021018074825406075, -0.04949686676263809, 0.0718517154455185, -0.0008082279819063842, -0.05119258537888527, -0.016725104302167892, -0.031902290880680084, 0.07473913580179214, -0.07040376961231232, -0.06263279169797897, -0.01866966485977173, -0.04580819234251976, -0.0018242622027173638, 0.02124813199043274, 0.01608111523091793, 0.033293697983026505, 0.04724595695734024, 0.06764823198318481, -0.010222701355814934, -0.09166357666254044, 0.0065320758149027824, 0.03907076641917229, 0.014404546469449997, -0.04371245950460434, -0.036747194826602936, -0.013570152223110199, -0.04874530807137489, -0.048001520335674286, 0.015114152804017067, 0.018710903823375702, 0.06920618563890457, -0.04024452716112137, 0.07851467281579971, -0.03879975154995918, 0.039278119802474976, -0.06678346544504166, -0.02139596827328205, 0.03483150899410248, 0.047795433551073074, -0.08071790635585785, 0.05023014172911644, -0.05863310769200325, 0.04391729459166527, -0.006796867586672306, -0.07156652212142944, -0.016402525827288628, 0.07568787783384323, 0.01640038751065731],
"VINSERTPSrr":[0.022902479395270348, 0.024018414318561554, 0.011011038906872272, 0.009072761051356792, -0.057881347835063934, 0.03739643841981888, 0.006768766324967146, 0.07734010368585587, 0.001509909750893712, 0.043335434049367905, -0.07244917005300522, -0.1078081876039505, 0.027186766266822815, 0.018834171816706657, 0.007436624728143215, -0.048498328775167465, 0.09450934827327728, -0.015420452691614628, 0.014672537334263325, -0.0012827727477997541, 0.019664635881781578, -0.026565955951809883, -0.04819858446717262, -0.0004387270309962332, 0.01676507107913494, -0.0014571163337677717, 0.015105879865586758, 0.04038102179765701, 0.008408628404140472, 0.07757255434989929, -0.09923559427261353, -0.04181523248553276, -0.0313955582678318, -0.006045420188456774, 0.05904707312583923, -0.014993838034570217, 0.03219055384397507, 0.058543696999549866, -0.06872320920228958, 0.021718619391322136, -0.08984571695327759, 0.06557019799947739, -0.018167613074183464, -0.011413732543587685, 0.036035921424627304, 0.10104569792747498, 0.05836406350135803, -0.02576756477355957, 0.03827998414635658, -0.06874323636293411, 0.01668366976082325, 0.048310816287994385, -0.010213772766292095, -0.035550933331251144, -0.03385040909051895, 0.004614332225173712, 0.018951643258333206, 0.10679180175065994, -0.019135646522045135, -0.011955377645790577, 0.028140606358647346, -0.08185642957687378, -0.015775075182318687, -0.011507326737046242, 0.07914295047521591, -0.030148068442940712, 0.11757981777191162, 0.00040086795343086123, 0.056880321353673935, -0.014461426064372063, -0.0008378245402127504, -0.06245473772287369, -0.05332277715206146, -0.038401950150728226, 0.005011103581637144, 0.0368003323674202, -0.021230563521385193, 0.01497745979577303, 0.0372738242149353, 0.07988940924406052, -0.013381360098719597, 0.0036820468958467245, 0.07501927763223648, -0.0996084213256836, 0.028014982119202614, -0.09410325437784195, 0.0007335525006055832, 0.004884959198534489, 0.040397197008132935, -0.07655651122331619, 0.05677357316017151, 0.005359896458685398, -0.047478366643190384, 0.03828851506114006, 0.03363208472728729, -0.041756175458431244, -0.00031817640410736203, 0.022837577387690544, 0.039567966014146805, 0.03662540763616562],
- "VIRT_REG_FR32":[0.0034248235169798136, -0.011980761773884296, -0.0501178540289402, 0.0494888611137867, 0.06103336811065674, -0.06178610771894455, 0.007709897588938475, -0.011392943561077118, 0.06570645421743393, 0.0771368145942688, 0.0005577280535362661, 0.013396150432527065, -0.041660163551568985, 0.05122360959649086, 0.11354377865791321, -0.009875510819256306, -0.06466709822416306, 0.048170577734708786, 0.0007201629341579974, 0.06538223475217819, 0.08870227634906769, -0.05771782249212265, 0.009273379109799862, -0.03325295075774193, 0.01197165809571743, 0.06604835391044617, 0.08265330642461777, -0.005758166313171387, 0.02512396313250065, 0.03383670747280121, 0.038484204560518265, -0.06539343297481537, -0.013461028225719929, 0.001498897559940815, 0.05170154944062233, 0.06965786963701248, -0.07339458167552948, 0.05094756931066513, 0.01983451284468174, -0.06855696439743042, 0.07892709225416183, 0.06099703162908554, 0.08492864668369293, 0.05357863008975983, -0.009294840507209301, -0.0054923719726502895, -0.029938997700810432, 0.028260599821805954, 0.053790509700775146, -0.06574371457099915, -0.009621666744351387, -0.08131514489650726, -0.08474338054656982, 0.039622966200113297, 0.06945627927780151, 0.02545306645333767, 0.005390701815485954, 0.04582791030406952, -0.1103447750210762, -0.050917647778987885, 0.03087870217859745, 0.06918162852525711, 0.0548822283744812, -0.01838473603129387, 0.05597897991538048, 0.03548860549926758, -0.009931124746799469, -0.07856663316488266, 0.033994875848293304, 0.03467561677098274, 0.09580692648887634, -0.04153195023536682, -0.06732118874788284, -0.06857144832611084, 0.03419093042612076, -0.01200241968035698, -0.06983492523431778, 0.05929506942629814, -0.00041734304977580905, -0.026396293193101883, 0.05230500176548958, -0.006162640172988176, 0.044198282063007355, -0.028765834867954254, 0.031155114993453026, 0.06967037916183472, -0.0892564132809639, 0.028816571459174156, -0.037065472453832626, 0.06540130823850632, -0.01888667233288288, 0.030632384121418, 0.0359313078224659, 0.106044240295887, 0.03259910270571709, -0.0775517001748085, -0.04267778620123863, 0.04977935180068016, -0.01790289767086506, -0.11223265528678894],
- "VIRT_REG_FR64":[0.08496882021427155, 0.049308884888887405, -0.016840212047100067, 0.010602951049804688, -4.6025739720789716e-05, -0.06524767726659775, 0.048670798540115356, -0.06444543600082397, -0.0031944462098181248, 0.05608433857560158, -0.03958145156502724, 0.05171080678701401, -0.03572545200586319, -0.054364755749702454, 0.052311528474092484, -0.0361458919942379, 0.024109655991196632, 0.15923210978507996, -0.07255382835865021, -0.011799084022641182, -0.06846465915441513, 0.0023571476340293884, 0.02642918936908245, -0.05057685822248459, 0.029800178483128548, -0.06036723777651787, -0.012272411957383156, -0.022802220657467842, -0.02426644042134285, 0.05623406544327736, -0.07506053894758224, -0.02078152634203434, 0.02549685165286064, -0.030025657266378403, -0.0627482682466507, 0.062375299632549286, 0.03684084117412567, 0.06365678459405899, 0.0004415051080286503, -0.002180535811930895, 0.05225013941526413, -0.0693102702498436, -0.03649357333779335, 0.005159272346645594, -0.03298519179224968, 0.041419681161642075, -0.05325934663414955, -0.017585784196853638, -0.03843431547284126, -0.002649943344295025, 0.033329058438539505, -0.04736043140292168, -0.043852102011442184, -0.06713785231113434, -0.03237355872988701, 0.012679073959589005, -0.01959240809082985, 0.07324203103780746, 0.07468831539154053, 0.03327644243836403, -0.01596391387283802, 0.12015434354543686, 0.051839299499988556, 0.00980563648045063, -0.08275608718395233, 0.04445798322558403, -0.03891860321164131, 0.10891054570674896, -0.008730625733733177, -0.051655255258083344, -0.05982912331819534, 0.04106972739100456, 0.06872759014368057, 0.013289053924381733, 0.03469584137201309, -0.06673429906368256, -0.0695682018995285, 0.047426726669073105, 0.02815094031393528, -0.05552271753549576, 0.0010567272547632456, -0.051840681582689285, -0.01704293303191662, -0.047185055911540985, 0.036965738981962204, 0.03452568128705025, -0.05430837720632553, 0.0383443646132946, 0.0003438846324570477, -0.030417989939451218, 0.02749026007950306, -0.0546082966029644, 0.03005768544971943, 0.0025131346192210913, 0.0013019279576838017, -0.054173994809389114, -0.008382225409150124, 0.02153395675122738, 0.011912085115909576, -0.10461334884166718],
- "VIRT_REG_GR16":[0.09543223679065704, 0.03513967618346214, 0.08986528217792511, -0.012217407114803791, -0.02076001651585102, -0.04190119728446007, 0.01318269595503807, -0.010142332874238491, -0.011869532987475395, -0.040446147322654724, 0.06552371382713318, 0.04439055174589157, 0.08176156878471375, -0.06334159523248672, -0.033928077667951584, -0.00024628525716252625, 0.0244551170617342, -0.019419007003307343, -0.09592454880475998, 0.005961012560874224, 0.03278326243162155, -0.07028506696224213, -0.08484592288732529, -6.329250754788518e-05, 0.015018146485090256, -0.05068608745932579, 0.0732998326420784, 0.023434389382600784, 0.0002124009479302913, 0.060401707887649536, 0.013626078143715858, -0.010556582361459732, -0.005069760140031576, -0.004616749472916126, -0.034329116344451904, 0.060584329068660736, -0.05430089309811592, -0.029179023578763008, 0.042385730892419815, -0.0652197003364563, 0.09378205984830856, -0.05090794339776039, -0.008510591462254524, 0.0837036669254303, 0.009071480482816696, 0.04464874789118767, -0.012855015695095062, 0.06306030601263046, -0.08556588739156723, -0.05393703281879425, -0.06741822510957718, -0.03717748448252678, 0.017156923189759254, 0.07401604950428009, -0.06629005819559097, -0.04564857482910156, -0.055414989590644836, 0.039407771080732346, -0.04089723527431488, 0.06915309280157089, 0.030190052464604378, 0.027542876079678535, 0.03557966649532318, 0.05191207677125931, -0.03237364813685417, -0.02036256715655327, -0.071859210729599, -0.06704329699277878, 0.0336633175611496, 0.09511569887399673, 0.0048662531189620495, 0.05273270234465599, -0.056247059255838394, 0.06079721450805664, -0.04150049015879631, -0.08104457706212997, -0.10303051024675369, 0.04522428661584854, -0.04379847273230553, -0.019447194412350655, 0.0021319733932614326, -0.010465282015502453, 0.06857019662857056, -0.00443653529509902, -0.08039603382349014, -0.05012141168117523, 0.0875077098608017, -0.03053239732980728, -0.05321606993675232, 0.016501901671290398, -0.0563507042825222, -0.03187479078769684, -0.0015389680629596114, 0.022985411807894707, -0.05008963868021965, 0.028300117701292038, 0.02875804342329502, -0.024458128958940506, -0.022238614037632942, -0.049835607409477234],
- "VIRT_REG_GR32":[-0.008479167707264423, -0.02941126376390457, 0.05343153327703476, 0.03769504278898239, -0.0006716987118124962, -0.0329299233853817, 0.03442851081490517, -0.06826753169298172, -0.09117511659860611, -0.018657755106687546, 0.029032904654741287, 0.02404048666357994, 0.010598761960864067, -0.0482308566570282, 0.06956348568201065, -0.027967501431703568, -0.07380961626768112, -0.021098148077726364, -0.0808446854352951, 0.0127912862226367, -0.01355082169175148, -0.040285225957632065, 0.035385165363550186, -0.001157263875938952, -0.026462145149707794, -0.08616211265325546, -0.044482193887233734, -0.010969695635139942, 0.04645564407110214, -0.018178211525082588, -0.038536932319402695, -0.027571648359298706, -0.007523007690906525, -0.02699458785355091, -0.039170436561107635, 0.12889482080936432, -0.04512789845466614, -0.03883056715130806, 0.051210880279541016, 0.03924906626343727, 0.036943964660167694, -0.016879307106137276, 0.011263007298111916, 0.053573690354824066, -0.018964825198054314, -0.041856080293655396, -0.036545924842357635, 0.07715532928705215, -0.041981130838394165, -0.04114629328250885, -0.04393022507429123, -0.030163627117872238, 0.0019487979589030147, 0.10988762229681015, 0.09039165079593658, -0.0035424421075731516, -0.06272851675748825, 0.007701062131673098, -0.01971622183918953, 0.06203003600239754, 0.048561323434114456, -0.04599940404295921, 0.00802221056073904, -0.002905400237068534, -0.1050020381808281, 0.003395768813788891, -0.07973644882440567, 0.008020970039069653, -0.08614815771579742, 0.0518532320857048, 0.021174483001232147, 0.03254232555627823, -0.01905026100575924, -0.0009989180834963918, -0.06409642845392227, -0.022425753995776176, -0.03563409671187401, 0.07717793434858322, -0.04553033784031868, -0.02112392708659172, -0.002374667674303055, 0.03828585892915726, -0.014221777208149433, -0.015974245965480804, -0.01805220916867256, 0.04202109947800636, -0.0841534212231636, 0.06608130037784576, -0.11586519330739975, 0.024179989472031593, 0.017091574147343636, 0.08567194640636444, -0.03692129999399185, 0.03266705200076103, -0.046154942363500595, 0.0040525165386497974, -0.03177625685930252, 0.039895471185445786, 0.042960215359926224, -0.05573953315615654],
- "VIRT_REG_GR32_ABCD":[0.016604775562882423, -0.0028934956062585115, 0.041060179471969604, -0.025077441707253456, -0.018642406910657883, 0.023762650787830353, -0.028646549209952354, -0.02460283786058426, 0.005985732190310955, 0.01774146780371666, -0.004014404024928808, -0.05473850294947624, -0.0417158380150795, -0.06322457641363144, 0.060795728117227554, -0.036435071378946304, -0.04245952516794205, 0.08069344609975815, 0.035319335758686066, -0.012020719237625599, 0.045771341770887375, -0.10842540860176086, 0.046253710985183716, -0.004099135287106037, 0.030616935342550278, -0.08288344740867615, 0.08569363504648209, -0.014164377935230732, -0.004303323570638895, 0.09726760536432266, 0.06208871304988861, -0.04007713496685028, 0.005815347656607628, 0.02377200312912464, 0.07813961058855057, 0.03192306309938431, -0.006230524741113186, 0.10110925883054733, -0.023409254848957062, 0.030774405226111412, -0.011607645079493523, -0.03929119184613228, 0.004817614797502756, -0.013827506452798843, 0.07770339399576187, -0.07994075864553452, -0.03157062083482742, 0.06743781268596649, 0.014881699346005917, -0.030165214091539383, -0.07844353467226028, -0.04563238099217415, 0.09747181832790375, 0.057128582149744034, 0.04173563793301582, -0.0011194447288289666, -0.01902887038886547, -0.032171595841646194, 0.04824799671769142, 0.008433254435658455, 0.024706291034817696, 0.0746094286441803, 0.04515853151679039, -0.0018984260968863964, -0.10070884972810745, -0.01883143000304699, -0.07785795629024506, 0.10938235372304916, -0.08001448959112167, -0.07419873028993607, 0.010544849559664726, 0.025767439976334572, -0.1005895584821701, 0.05103800818324089, -0.03675306960940361, -0.020510872825980186, 0.022482097148895264, 0.06463642418384552, -0.03149804100394249, -0.021647030487656593, 0.04025804623961449, 0.003628256032243371, 0.03532547131180763, -0.08667688816785812, 0.018817460164427757, -0.01690257526934147, -0.10114696621894836, -0.022815177217125893, 0.024386661127209663, 0.10286301374435425, 0.030005114153027534, 0.0370776504278183, -0.008584428578615189, -0.077603779733181, -0.03588058054447174, 0.030617419630289078, -0.07383710891008377, 0.03215676173567772, 0.03288266062736511, -0.036702848970890045],
- "VIRT_REG_GR32_NOREX":[0.019052108749747276, -0.006784944795072079, -0.05410394072532654, 0.001966317882761359, -0.06686867773532867, 0.013514372520148754, 0.030097918584942818, -0.03868359327316284, 0.004314934369176626, -0.06713679432868958, 0.02491898462176323, 0.027683967724442482, 0.035907283425331116, -0.023093875497579575, -0.0892200842499733, -0.1052003800868988, -0.03923499956727028, 0.08808581531047821, -0.10092058777809143, 0.03336786851286888, -0.08974049985408783, -0.015254802070558071, 0.039686985313892365, -0.010083628818392754, -0.03423550724983215, -0.08821681141853333, -0.05621311068534851, -0.020327769219875336, -0.016793876886367798, 0.08908043801784515, -0.04112761467695236, -0.050139520317316055, -0.01524045504629612, 0.05841142684221268, 0.08270087838172913, 0.0348736047744751, -0.016146546229720116, 0.05751227214932442, 0.05081859603524208, -0.07304663956165314, -0.047101784497499466, -0.02825125865638256, 0.0006340605323202908, 0.0008785317186266184, -0.044239338487386703, 0.007173972204327583, -0.029449066147208214, 0.07254412025213242, -0.026029080152511597, 0.025982191786170006, -0.09524690359830856, -0.052613094449043274, -0.1270490437746048, 0.05319184809923172, 0.1046818196773529, 0.0477570965886116, -0.06291303783655167, 0.04725426062941551, -0.05330964922904968, 0.04056742787361145, 0.01543382927775383, 0.03627128154039383, -0.048232536762952805, 0.014761016704142094, -0.007380587514489889, -0.008060632273554802, -0.021923277527093887, -0.022500980645418167, -0.08495079725980759, 0.045358967036008835, -0.04728720709681511, 0.03550735488533974, 0.03445536270737648, -0.01891610585153103, -0.09439470618963242, -0.044266197830438614, -0.07952893525362015, 0.05221104994416237, -0.03507477045059204, 0.04218391329050064, 0.040326621383428574, -0.0395088866353035, 0.02447870559990406, -0.04280063137412071, 0.06520935893058777, -0.003358252113685012, -0.057561881840229034, 0.01911463774740696, 0.05295571684837341, 0.030342884361743927, 0.03814920783042908, -0.03366788476705551, 0.03090745024383068, 0.09487249702215195, -0.002995486371219158, -0.012020634487271309, -0.029147809371352196, 0.09558248519897461, 0.02548893168568611, 0.0931544378399849],
- "VIRT_REG_GR64":[0.02717440389096737, -0.026730243116617203, -0.023244258016347885, 0.04027782380580902, 0.006808254402130842, -0.027519788593053818, -0.01906559243798256, 0.027793627232313156, -0.00129543652292341, -0.03455121070146561, 0.021734628826379776, 0.035481199622154236, -0.07251942157745361, -0.025691546499729156, -0.03271827474236488, -0.13225725293159485, -0.0601421520113945, 0.09084498882293701, -0.10225717723369598, 0.004034099169075489, 0.023578351363539696, -0.041603971272706985, 0.04199974611401558, -0.014711204916238785, -0.04272732138633728, -0.12534455955028534, -0.023738788440823555, 0.005328727886080742, 0.038416482508182526, -0.026419155299663544, -0.041119154542684555, 0.00022502713545691222, -0.05204978585243225, -0.019709734246134758, -0.04102563485503197, 0.06480151414871216, 0.009224721230566502, 0.04627599939703941, 0.027821402996778488, -0.05595114827156067, 0.04526345059275627, 0.024196594953536987, 0.10446277260780334, 0.07561361789703369, -0.08028160035610199, -0.0314163975417614, 0.11944323033094406, 0.1025814488530159, -0.08457476645708084, 0.02227119728922844, -0.041679076850414276, -0.02260834351181984, 0.036674268543720245, 0.10488750785589218, 0.019218411296606064, -0.015966340899467468, -0.06852715462446213, 0.026523491367697716, -0.11090730130672455, -0.0021082640159875154, -0.048291631042957306, -0.032388005405664444, 0.015713853761553764, 0.03355225548148155, -0.06502845883369446, -0.010098783299326897, -0.09930021315813065, -0.017413528636097908, -0.055861033499240875, 0.0801810696721077, -0.03900628536939621, -0.03278445452451706, -0.0337282195687294, -0.11434067040681839, -0.04371264949440956, -0.01736009307205677, -0.05100121721625328, 0.07490750402212143, -0.014680330641567707, -0.02126181870698929, 0.018013890832662582, 0.0018135658465325832, 0.029781077057123184, -0.012477489188313484, -0.021443217992782593, 0.047576501965522766, -0.05993758141994476, -0.06040889024734497, 0.016642581671476364, 0.011624492704868317, -0.042229063808918, -0.007573941722512245, -0.04010608047246933, -0.006444427650421858, -0.014495199546217918, -0.04122597724199295, -0.08505907654762268, -0.004049300216138363, 0.06545045226812363, -0.04762336611747742],
- "VIRT_REG_GR64_ABCD":[0.04577033221721649, -0.07758746296167374, 0.00799313560128212, -0.11011485010385513, -0.010862522758543491, 0.012709266506135464, 0.05257265642285347, -0.07354705780744553, 0.04262387007474899, 0.07554348558187485, -0.06358839571475983, 0.006669520866125822, 0.049098193645477295, 0.11183933168649673, -0.028112098574638367, 0.021986473351716995, -0.02839403599500656, -0.06199958547949791, 0.08614487200975418, -0.041216861456632614, 0.041238460689783096, 0.005937385838478804, 0.00200703926384449, -0.05337367579340935, 0.037919919937849045, -0.07485998421907425, -0.09153831005096436, -0.0554175041615963, -0.10251995176076889, -0.01289951242506504, -0.030631467700004578, 0.04197017475962639, -0.03578301519155502, 0.010593005456030369, -0.05836241692304611, 0.06809061765670776, 0.10871735960245132, -0.09833388775587082, -0.009873395785689354, -0.056898634880781174, 0.05946199968457222, 0.015534073114395142, 0.01677171140909195, -0.020233800634741783, -0.006396631710231304, -0.049332089722156525, 0.012649210169911385, 0.03756912052631378, 0.0033660116605460644, -0.09084216505289078, -0.07142844051122665, -0.0030346515122801065, 0.0019640070386230946, 0.038837920874357224, 0.011760945431888103, 0.04995080456137657, -0.06997165083885193, -0.035297296941280365, 0.01996617764234543, 0.01954355463385582, -0.0934600979089737, 0.030165065079927444, -0.007337240036576986, -0.05346155911684036, 0.0732186883687973, -0.04716489836573601, -0.06555212289094925, -0.018465254455804825, 0.051119767129421234, -0.03106619231402874, 0.0748852789402008, -0.02095886692404747, 0.006320921704173088, 0.03146332502365112, -0.08238139003515244, -0.03618254140019417, -0.014570276252925396, 0.062481846660375595, -0.0394093319773674, -0.05171547457575798, -0.044726233929395676, -0.01228095218539238, 0.09699232876300812, 0.07471026480197906, 0.03112417459487915, 0.022543631494045258, -0.08634103089570999, 0.059702761471271515, -0.013801504857838154, 0.004984616301953793, 0.045798566192388535, -0.03205988556146622, -0.06150995194911957, -0.02244667150080204, 0.03318532556295395, 0.03462471440434456, 0.03236381709575653, 0.0884014293551445, -0.01604369841516018, -0.05234146490693092],
- "VIRT_REG_GR64_NOREX":[-0.03959479182958603, -0.06190898269414902, -0.02920372597873211, -0.09973344951868057, -0.004333901684731245, -0.08522991091012955, 0.0459987074136734, -0.057674553245306015, 0.037046968936920166, -0.05669403821229935, -0.02221340872347355, -0.062426190823316574, 0.05804889276623726, -0.02635439857840538, -0.045627325773239136, 0.03632078319787979, 0.07128578424453735, 0.07544906437397003, -0.0537678524851799, -0.04624016210436821, 0.014316501095890999, 0.05580946058034897, 0.05251356214284897, -0.08244197070598602, -0.08901460468769073, -0.07641059905290604, -0.04924754425883293, 0.05417120084166527, -0.0060508353635668755, -0.00814742036163807, -0.06154030188918114, 0.05966867506504059, -0.03231468051671982, 0.021429890766739845, 0.031103987246751785, 0.04343251883983612, -0.08997714519500732, 0.039365898817777634, 0.052908625453710556, -0.02683917060494423, -0.05547752603888512, -0.014131218194961548, 0.0016863569617271423, -0.041112788021564484, -0.010230163112282753, -0.06687774509191513, -0.006144971586763859, -0.08074352145195007, 0.04034091532230377, -0.08176303654909134, -0.004055786412209272, -0.0024839320685714483, -0.007289807312190533, 0.06915127485990524, 0.023709064349532127, 0.04671626538038254, 0.06229325756430626, 0.04707597941160202, 0.06800796836614609, -0.02885584905743599, 0.030613983049988747, -0.019083039835095406, 0.045457858592271805, 0.040770504623651505, -0.05441175401210785, -0.05712401866912842, 0.07744520157575607, -0.0756613239645958, -0.06890957802534103, -0.07997069507837296, 0.09348486363887787, -0.04511028528213501, 0.036194607615470886, 0.040017660707235336, 0.016245214268565178, 0.023104460909962654, 0.058383163064718246, 0.0679842159152031, -0.00921112485229969, -0.10036550462245941, 0.09075804799795151, -0.059704095125198364, -0.013338442891836166, -0.005139742512255907, 0.07807526737451553, 0.06255412846803665, -0.008151572197675705, -0.0624256506562233, 0.012590888887643814, 0.03665084019303322, -0.028498578816652298, -0.01614067517220974, 0.007552243769168854, -0.007216903381049633, 0.0760180801153183, -0.04200543463230133, 0.06412865966558456, -0.05136435106396675, -0.0024792966432869434, 0.06856651604175568],
- "VIRT_REG_GR64_NOREX_NOSP":[-0.0656895712018013, 0.058077458292245865, -0.006653467658907175, 0.037784356623888016, 0.07274001836776733, 0.07232078164815903, 0.07074914127588272, 0.05637859180569649, 0.04296007752418518, 0.05499762296676636, -0.01783664897084236, -0.08387365937232971, -0.01376343984156847, -0.07938199490308762, -0.027822256088256836, -0.0663403570652008, 0.036170270293951035, -0.07460261881351471, 0.08652043342590332, 0.02483147382736206, -0.07939319312572479, 0.033202506601810455, 0.0903514102101326, -0.10181311517953873, 0.060751549899578094, 0.07619930803775787, 0.05017509311437607, -0.0470910519361496, 0.07713821530342102, -0.0426195003092289, -0.04506472498178482, 0.003363420255482197, -0.0017315347213298082, 0.06264199316501617, 0.005245774984359741, -0.027923958376049995, 0.09868567436933517, 0.06738796830177307, -0.10339145362377167, 0.0020383980590850115, 0.087734155356884, 0.011040030047297478, -0.05993311479687691, -0.05790332704782486, 0.01574312523007393, 0.009771298617124557, 0.022676382213830948, -0.009197148494422436, 0.03372732177376747, 0.08404259383678436, -0.015135225839912891, -0.04693703353404999, 0.09917140752077103, 0.007134507410228252, 0.020209072157740593, -0.00027669535484164953, -0.0351635180413723, 0.03751315921545029, -0.019665181636810303, 0.028500953689217567, 0.034186746925115585, -0.005931361112743616, 0.05645192414522171, -0.02027188241481781, -0.022675039246678352, -0.08812297880649567, -0.014896178618073463, -0.048788342624902725, 0.008708382956683636, 0.019917558878660202, -0.002275944221764803, 0.03409638628363609, 0.033304013311862946, 0.057676300406455994, 0.039842985570430756, -0.025169866159558296, 0.016520975157618523, -0.030201178044080734, -0.021718870848417282, -0.07023277878761292, -0.007528252899646759, 0.009067370556294918, -0.0460657961666584, 0.07117785513401031, -0.03609836474061012, -0.011893372051417828, -0.006047600414603949, 0.0179970171302557, 0.024480223655700684, -0.03918423503637314, 0.004897980485111475, 0.05040167644619942, 0.010113563388586044, -0.1074901670217514, -0.06277655810117722, -0.02934161201119423, -0.06922926008701324, -0.05638887360692024, 0.05314395949244499, 0.04588884115219116],
- "VIRT_REG_GR64_NOSP":[0.0015277941711246967, -0.03938478231430054, -0.030811766162514687, 0.027071669697761536, 0.02127140760421753, 0.0015787228476256132, -0.07842491567134857, 0.004658385645598173, -0.05909501388669014, -0.03576778993010521, -0.07251477241516113, 0.12117832154035568, 0.04499363154172897, -0.009405314922332764, -0.01015283353626728, -0.002841090550646186, 0.0689091831445694, 0.10697457194328308, -0.09274765104055405, -0.027955353260040283, -0.0379958301782608, -0.044126156717538834, 0.04907212778925896, -0.038063473999500275, -0.003686746582388878, -0.08313410729169846, -0.045181579887866974, -0.011702840216457844, -0.006579228211194277, 0.046807315200567245, -0.045654296875, -0.03466613590717316, -0.08313826471567154, -0.06678880006074905, -0.027727074921131134, 0.036734677851200104, -0.040936414152383804, 0.05170389637351036, 0.038199927657842636, 0.02960256300866604, 0.0355701707303524, -0.02052776888012886, 0.06218089163303375, 0.10570456087589264, -0.036479029804468155, -0.008999336510896683, -0.031860992312431335, 0.07250168174505234, -0.061084795743227005, -0.057996805757284164, -0.010533110238611698, -0.018169214949011803, 0.017261315137147903, 0.10023517906665802, -0.044131457805633545, -0.07618662714958191, -0.09124933928251266, 0.01819406822323799, -0.05906827375292778, 0.04295642301440239, -0.03197735920548439, 0.03641442954540253, 0.005168464966118336, -0.00010972691961796954, -0.0829579159617424, -0.014677388593554497, -0.08750011026859283, -0.04695136100053787, -0.07696729153394699, -0.00718996487557888, 0.018294518813490868, -0.014321570284664631, -0.04416860267519951, -0.0890057235956192, -0.014466283842921257, 0.02831638976931572, -0.04845190420746803, 0.08228176832199097, 0.03420877829194069, 0.056510377675294876, 0.037403274327516556, 0.04364967346191406, 0.08903267979621887, -0.016827082261443138, -0.0682789757847786, 0.06286796927452087, -0.0958203598856926, 0.018489282578229904, 0.02886355295777321, 0.028006011620163918, 0.039986785501241684, -0.04771937429904938, -0.004648604430258274, 0.033939141780138016, -0.027820419520139694, -0.026187442243099213, -0.07972361892461777, 0.006323353853076696, 0.016448041424155235, -0.01961681991815567],
- "VIRT_REG_GR64_NOSP_and_GR64_TC":[0.08079065382480621, -0.05147358775138855, -0.08338657021522522, 0.06757336109876633, -0.015237463638186455, 0.026806311681866646, 0.07564966380596161, -0.037159934639930725, -0.02222878858447075, -0.04553138092160225, -0.006632891017943621, 0.001604291144758463, 0.043711669743061066, 0.0710049569606781, -0.08854726701974869, -0.03142566233873367, -0.0865127220749855, 0.08521236479282379, 0.039203498512506485, 0.04737624153494835, 0.02893459051847458, 0.004120660945773125, 0.03552098199725151, -0.0010448878165334463, 0.04423774778842926, 0.03258584439754486, 0.03433830663561821, -0.019990455359220505, -0.03263172507286072, 0.09782663732767105, -0.00702365068718791, -0.06544602662324905, 0.013447105884552002, 0.04603038728237152, 0.029931804165244102, 0.0988783910870552, -0.062023941427469254, -0.0070026409812271595, 0.032557111233472824, -0.08212000876665115, 0.03199682757258415, 0.020828546956181526, 0.07071725279092789, -0.018812179565429688, -0.0184739138931036, -0.06008931249380112, 0.01504000648856163, -0.019235603511333466, 0.014653048478066921, -0.009083813987672329, 0.03171474114060402, 0.019499456509947777, 0.05263463407754898, 0.10554639250040054, -0.02759619802236557, -0.00156494346447289, -0.03898271545767784, 0.06027846410870552, -0.061001915484666824, 0.039365388453006744, -0.06546281278133392, 0.0006352368509396911, 0.0500405877828598, -0.03232716768980026, -0.010176514275372028, 0.002549059921875596, 0.0666508674621582, -0.037290267646312714, -0.028836704790592194, 0.06271649152040482, -0.016647985205054283, 0.013602355495095253, 0.020110899582505226, 0.011730309575796127, -0.10071564465761185, -0.06239647418260574, -0.09507977962493896, -0.09190725535154343, -0.08861985802650452, -0.0006123466300778091, 0.0951915979385376, -0.035364676266908646, -0.04007220268249512, 0.08415472507476807, 0.0006664254469797015, 0.05864431709051132, 0.01460045762360096, -0.09507087618112564, 0.024228032678365707, 0.04208158329129219, 0.006106846500188112, 0.09294755011796951, 0.06157369166612625, 0.0826527327299118, -0.058974966406822205, -0.09958664327859879, 0.06913749873638153, -0.08108915388584137, 0.07425157725811005, 0.04784728214144707],
- "VIRT_REG_GR64_TC":[-0.0944172665476799, 0.040403831750154495, -0.017597073689103127, 0.04766053333878517, -0.03104357235133648, 0.025751160457730293, 0.036779265850782394, -0.0235747080296278, 0.032111138105392456, 0.009872193448245525, -0.01596468687057495, 0.05234881862998009, -0.047335200011730194, 0.005157034378498793, -0.02132921665906906, -0.0544377863407135, 0.057515472173690796, -0.006743279751390219, -0.01474941335618496, -0.0990658849477768, 0.022418741136789322, -0.007098495960235596, 0.046933863312006, 0.1002131924033165, 0.01583809033036232, 0.03995800018310547, -0.017743254080414772, -0.01684877835214138, 0.06543229520320892, 0.04597911611199379, 0.05365373566746712, -0.008774830959737301, -0.01341968309134245, -0.004754040390253067, 0.04739849269390106, 0.032378777861595154, -0.0020728895906358957, 0.03502136841416359, 0.05946416035294533, -0.06190952658653259, 0.01910495012998581, -0.023678753525018692, 0.012653682380914688, -0.06766874343156815, -0.0729866623878479, 0.0757005363702774, -0.027033904567360878, -0.06776778399944305, -0.010131776332855225, -0.06334701925516129, -0.04702980816364288, 0.06837917864322662, 0.002726735547184944, 0.04345812648534775, 0.04288078844547272, -0.06921732425689697, -0.07625382393598557, 0.037991974502801895, -0.04257906600832939, 0.06338586658239365, 0.05315309390425682, -0.02785014547407627, 0.04054750129580498, 0.06967299431562424, -0.07271680235862732, 0.0032969408202916384, -0.08254148811101913, 0.07269596308469772, -0.01827111467719078, 0.034775473177433014, 0.010106234811246395, 0.0389409065246582, 0.042805008590221405, -0.03822058066725731, 0.0668339803814888, -0.005216705612838268, -0.00022202919353730977, -0.0221820380538702, -0.027401722967624664, -0.045061662793159485, -0.05296671763062477, -0.0190189890563488, -0.002744461875408888, -0.04073096439242363, -0.06974441558122635, 0.05868958309292793, -0.06907399743795395, -0.026619713753461838, 0.015318086370825768, 0.035948701202869415, -0.08301021158695221, 0.03955607861280441, 0.028369972482323647, 0.0202812347561121, -0.12075140327215195, -0.039504438638687134, -0.03826067969202995, 0.01607581228017807, 0.02135113812983036, -0.08897850662469864],
- "VIRT_REG_GR64_TC_with_sub_8bit":[0.00805664248764515, 0.06228634715080261, -0.005148644559085369, -0.025605352595448494, -0.04853198677301407, -0.018169978633522987, 0.008530518971383572, -0.1050964742898941, -0.08428415656089783, -0.014802628196775913, 0.05918573588132858, 0.07529161125421524, 0.09815273433923721, -0.014188972301781178, 0.06676790118217468, 0.09496084600687027, -0.03843621164560318, -0.00740150036290288, -0.11988909542560577, -0.01781499572098255, -0.03719411790370941, -0.07447166740894318, 0.005513608455657959, -0.014381160028278828, 0.036786310374736786, -0.04839075356721878, -0.009440913796424866, 0.03984222561120987, -0.08096668124198914, 0.026751000434160233, 0.06400448083877563, 0.07998895645141602, 2.295125523232855e-05, 0.0266779325902462, -0.0030931613873690367, 0.05236855521798134, -0.010479471646249294, -0.011119752191007137, -0.06124376133084297, -0.019449712708592415, 0.03448517248034477, -0.04095051810145378, 0.01377212442457676, 0.09643338620662689, 0.021325431764125824, 0.06029453128576279, 0.048866767436265945, -0.03436344116926193, -0.043422505259513855, 0.03822150453925133, 0.004718889016658068, -0.04090931639075279, -0.04219569265842438, 0.019032739102840424, 0.06111171841621399, 0.04305591061711311, -0.0379939004778862, -0.03224434703588486, -0.06517905741930008, 0.002272483194246888, 0.09273418039083481, -0.028145847842097282, 0.01824336126446724, 0.00936606340110302, -0.07281909137964249, -0.028650810942053795, -0.060721538960933685, -0.09477518498897552, -0.0014060320099815726, 0.06919887661933899, -0.03463669493794441, 0.0026504716370254755, -0.0653621107339859, -0.02800566703081131, -0.02503957599401474, -0.060285311192274094, 0.014794053509831429, -0.08424058556556702, 0.0482206828892231, -0.07467620074748993, -0.09909844398498535, -0.06888734549283981, -0.0014173799427226186, -0.09022543579339981, 0.06461413204669952, 0.024526789784431458, -0.07400602847337723, -0.008816084824502468, 0.025513656437397003, 0.047476526349782944, -0.05981749668717384, 0.08338218182325363, 0.02657591737806797, 0.03547860309481621, -0.043622229248285294, 0.10129662603139877, 0.08802521973848343, -0.09759330749511719, 0.025680232793092728, 0.05964493378996849],
- "VIRT_REG_GR64_with_sub_16bit_in_GR16_NOREX":[-0.03117012232542038, -0.02872271090745926, -0.039712607860565186, 0.03738812729716301, 0.030099159106612206, 0.00013636364019475877, -0.019107641652226448, -0.04186702147126198, -0.053099144250154495, -0.020432034507393837, -0.0004185919533483684, 0.010934959165751934, 0.036054231226444244, 0.03788067027926445, 0.05227302014827728, -0.034505825489759445, -0.08298061788082123, 0.0399160161614418, 0.03668724000453949, 0.014606554992496967, -0.0071771652437746525, 0.059049926698207855, -0.06330917030572891, 0.007379058748483658, -0.0750177726149559, -0.0423760749399662, -0.019386067986488342, -0.018436923623085022, -0.015116279944777489, 0.023602722212672234, 0.0533282607793808, -0.026401247829198837, 0.023750485852360725, -0.027648568153381348, -0.016443056985735893, 0.04291580244898796, -0.04391908273100853, 0.05113501846790314, -0.03743087872862816, 0.056367188692092896, 0.048130668699741364, -0.0230261143296957, 0.03358393907546997, -0.030188169330358505, 0.08421863615512848, 0.0033821314573287964, 0.03151029348373413, -0.042818162590265274, 0.04007953777909279, -0.0050337472930550575, 0.03335743024945259, -0.026563530787825584, 0.016440672799944878, -0.04272226244211197, -0.07304228097200394, 0.024836458265781403, -0.016342775896191597, -0.055494848638772964, -0.05826134234666824, 0.027478834614157677, 0.025981346145272255, -0.04745938256382942, 0.013695796020328999, -0.027888784185051918, 0.03769542649388313, -0.024486247450113297, 0.04720773920416832, -0.012697651982307434, -0.03559652715921402, 0.012948199175298214, -0.025600459426641464, 0.014954420737922192, -0.06651762872934341, 0.04277091473340988, -0.08291683346033096, 0.016881149262189865, 0.04145864024758339, -0.04162050038576126, -0.03363965451717377, -0.05018439516425133, 0.06321889907121658, -0.00871780700981617, 0.06867428869009018, 0.057975344359874725, 0.009704249911010265, 0.049075234681367874, -0.06111253425478935, 0.027943406254053116, 0.03725599870085716, 0.032480716705322266, -0.01960119605064392, -0.0295172780752182, 0.014026675373315811, 0.056797921657562256, -0.031707022339105606, 0.0010152219329029322, -0.023705823346972466, -0.07695567607879639, 0.017504720017313957, -0.0020094760693609715],
- "VIRT_REG_GR64_with_sub_8bit":[-0.011493992060422897, -0.027181852608919144, 0.022013556212186813, 0.05687474459409714, -0.03289574757218361, -0.04803529754281044, -0.04204253479838371, 0.044671084731817245, -0.0849028080701828, -0.09561576694250107, 0.03596775606274605, 0.027156801894307137, 0.05034027621150017, -0.006308000069111586, 0.012393618933856487, -0.048590339720249176, -0.049129705876111984, 0.059305012226104736, -0.10330235958099365, 0.00738809397444129, 0.03855152800679207, -0.03220852091908455, 0.05221837759017944, -0.01274650078266859, 0.024303985759615898, -0.05925533175468445, -0.015623844228684902, -0.025864524766802788, 0.009918035939335823, 0.004779431037604809, -0.02866589091718197, 0.006512579973787069, -0.037251196801662445, 0.005028596147894859, -0.011677909642457962, 0.051886074244976044, -0.03552602231502533, 0.011968757025897503, 0.00829426757991314, -0.06981230527162552, -0.029781555756926537, -0.012621275149285793, 0.08595969527959824, 0.08630531281232834, 0.10018875449895859, -0.054863955825567245, -0.044519901275634766, 0.0893385037779808, 0.04004377871751785, 0.003711731405928731, -0.021447300910949707, -0.08500636368989944, 0.0037281641270965338, 0.14561010897159576, 0.03993009030818939, 0.07621612399816513, 0.020513180643320084, 0.004926605150103569, -0.035578932613134384, 0.06101486086845398, -0.08422145247459412, -0.03511432558298111, 0.01537742093205452, -0.010146304965019226, -0.05133780837059021, -0.010472903028130531, -0.09726933389902115, -0.010570867918431759, -0.09348491579294205, 0.002129049738869071, -0.01265127956867218, 0.03504374623298645, -0.008679943159222603, -0.002507386729121208, -0.06586045026779175, -0.04775359109044075, -0.042809367179870605, 0.08359787613153458, -0.0230431966483593, -0.015440763905644417, 0.0195400882512331, -0.0186530202627182, -0.03176320344209671, -0.019522372633218765, -0.02984560839831829, 0.024256182834506035, -0.07656785100698471, 0.03944750130176544, 0.016559945419430733, 0.007124909665435553, 0.08061631768941879, 0.08561833202838898, -0.018525447696447372, -0.0019649232272058725, -0.018469924107193947, -0.012311050668358803, -0.08448101580142975, 0.060216110199689865, 0.06368701905012131, -0.07110093533992767],
- "VIRT_REG_GR8":[0.02255251444876194, 0.012649326585233212, 0.05363747105002403, -0.006129346787929535, 0.027027001604437828, 0.03703385218977928, -0.045294541865587234, -0.02489621751010418, 0.026587747037410736, -0.06228360906243324, 0.01547946222126484, 0.03494448587298393, 0.08276952058076859, -0.03470698744058609, 0.0036826131399720907, 0.04216131567955017, -0.04518325626850128, 0.09584730118513107, -0.09126991778612137, -0.11293632537126541, 0.0141398124396801, -0.05086163431406021, 0.0421922467648983, -0.0001364851341350004, 0.05821910500526428, -0.04154132679104805, 0.036521218717098236, -0.016718950122594833, 0.0773339569568634, 0.05134757608175278, -0.03728386387228966, -0.014684299007058144, 0.016949277371168137, 0.025767508894205093, -0.01573120802640915, 0.0343811996281147, 0.008209497667849064, 0.0011038129450753331, -0.06688684970140457, -0.08167136460542679, 0.03875276446342468, 0.08301592618227005, 0.023012684658169746, 0.07135005295276642, 0.008461466059088707, 0.004998552612960339, 0.02622731775045395, -0.09479465335607529, 0.014987453818321228, -0.008574756793677807, -0.008050303906202316, -0.005560623947530985, 0.04616820812225342, 0.11537269502878189, 0.032199542969465256, 0.05507092550396919, -0.053164780139923096, 0.012255114503204823, -0.01981479674577713, 0.06012535095214844, 0.043957680463790894, 0.02384384348988533, 0.04837791621685028, 0.04945961385965347, -0.1063770279288292, -0.07354240119457245, -0.08922741562128067, -0.026019031181931496, -0.08768662065267563, 0.09241457283496857, 0.03253300115466118, -0.018267929553985596, -0.04406850412487984, -0.05577726289629936, -0.05304105579853058, 0.016035545617341995, 0.05610279366374016, 0.06247573718428612, -0.019430609419941902, -0.017088554799556732, -0.022114543244242668, 0.07442588359117508, -0.017668865621089935, -0.02403153106570244, 0.006919574458152056, 0.05879344418644905, -0.0885634645819664, -0.016336753964424133, -0.024662213400006294, 0.029266972094774246, -0.04889025166630745, 0.042460259050130844, -0.013102580793201923, 0.023992935195565224, 0.024768078699707985, 0.047551900148391724, -0.02243787795305252, 0.05929713696241379, 0.03110451251268387, -0.00550821190699935],
- "VIRT_REG_RFP80":[-0.04414765536785126, 0.05147779360413551, -0.035608600825071335, -0.03939598798751831, 0.0430026613175869, -0.03331028297543526, 0.015591064468026161, 0.01892651617527008, -0.011428372003138065, -0.06980786472558975, 0.06445881724357605, 0.1036338210105896, 0.01164929661899805, -0.07599718868732452, 0.022036561742424965, 0.10396245121955872, -0.041171155869960785, -0.07264886051416397, 0.00032837275648489594, 0.02848120965063572, -0.031889040023088455, 0.023848745971918106, -0.02298046089708805, -0.05559201166033745, 0.026687605306506157, 0.0565699003636837, -0.0134252505376935, 0.05494402348995209, -0.0584089457988739, 0.05422470346093178, -0.024360226467251778, 0.03570455685257912, 0.013681530021131039, -0.006910417694598436, 0.011886067688465118, 0.07619262486696243, 0.08147607743740082, 0.05824091285467148, 0.001224246108904481, -0.030463339760899544, -0.023527851328253746, 0.03078501485288143, -0.02225799672305584, -0.058049511164426804, 0.015403151512145996, 0.07900431007146835, 0.025944147258996964, 0.021455328911542892, 0.023985104635357857, -0.0327906534075737, 0.04195002466440201, -0.10313323140144348, -0.023333510383963585, -0.010316243395209312, -0.02042137086391449, 0.07474000751972198, 0.02313513681292534, -0.0030733307357877493, 0.06138097122311592, 0.005197131074965, -0.03222955763339996, 0.005364845506846905, -0.05313501134514809, 0.0013082564109936357, 0.025044983252882957, 0.0349799208343029, 0.09704083949327469, -0.017403649166226387, -0.03375721350312233, 0.05970870703458786, -0.021679691970348358, -0.04719642922282219, 0.024217652156949043, -0.06130526587367058, 0.004813425708562136, 0.07473690062761307, -0.039600174874067307, -0.009295261465013027, 0.05440402403473854, 0.04785943776369095, -0.04006686061620712, -0.020133933052420616, 0.00989031046628952, -0.054447200149297714, 0.06291327625513077, -0.01196430902928114, 0.0841275230050087, -0.05557875707745552, -0.0813804343342781, -0.0746457576751709, -0.024255990982055664, -0.048101916909217834, -0.014132879674434662, -0.013147399760782719, -0.009715595282614231, 0.08717820793390274, -0.04318689927458763, -0.0311901792883873, -0.017253845930099487, 0.005144816357642412],
- "VIRT_REG_VR128":[0.08292517066001892, 0.053138989955186844, 0.0019234063802286983, -0.030035940930247307, 0.0821828693151474, -0.0540342852473259, 0.06449387222528458, -0.03985493257641792, 0.026820721104741096, 0.0352952741086483, -0.1056072935461998, 0.054804764688014984, 0.01685425080358982, 0.05867069214582443, 0.11665259301662445, -0.07655566930770874, 0.021201618015766144, 0.00927705504000187, -0.04723019897937775, 0.016582123935222626, -0.01160470675677061, -0.013075411319732666, 0.01054342370480299, -0.05403316020965576, 0.033609066158533096, -0.07971179485321045, 0.1005927175283432, -0.020655132830142975, -0.0036442605778574944, 0.018269486725330353, 0.036334097385406494, -0.06517180055379868, -0.028530113399028778, -0.03768114373087883, 0.10582506656646729, 0.011199450120329857, -0.06707775592803955, 0.02332702837884426, -0.014528930187225342, -0.09369251132011414, 0.069722481071949, 0.031001657247543335, 0.08032777905464172, -0.060744334012269974, 0.015131807886064053, 0.01935953088104725, -0.087028868496418, 0.041773099452257156, 0.0381581112742424, -0.07518653571605682, 0.021307995542883873, -0.07350508868694305, -0.04699733853340149, -0.007377162110060453, 0.07836157828569412, 0.016066696494817734, -0.02160775288939476, -0.030519334599375725, -0.09255059063434601, 0.03597188740968704, -0.11260625720024109, -0.08602424710988998, 0.058293748646974564, -0.034749604761600494, 0.005541469436138868, -0.07924741506576538, -0.024103455245494843, 0.06047135218977928, 0.026729481294751167, 0.03493977710604668, -0.07453227788209915, -0.01716521382331848, 0.008985077030956745, -0.08075122535228729, 0.03353623300790787, -0.08125714957714081, 0.04245763644576073, 0.06520543247461319, 0.020550349727272987, -0.003161275526508689, -0.03491697832942009, -0.005496494937688112, 0.09021904319524765, -0.057418785989284515, 0.03494826331734657, -0.052578359842300415, -0.044952504336833954, 0.11770184338092804, -0.048565153032541275, -0.03815764561295509, 0.06020108237862587, -0.09397949278354645, 0.03820547088980675, 0.08039405196905136, 0.014751153998076916, 0.006572262849658728, 0.05658692866563797, 0.05043925344944, -0.0060436660423874855, -0.12018798291683197],
- "VIRT_REG_VR256":[0.032775089144706726, 0.029240285977721214, 0.01821955479681492, 0.023595772683620453, -0.02587016113102436, -0.12190376222133636, 0.09720813482999802, 0.005780891049653292, -0.0581410676240921, 0.04817686229944229, -0.04627984017133713, 0.03618951886892319, -0.10393846780061722, 0.04380590096116066, 0.030101926997303963, -0.021811308339238167, 0.0012455569813027978, 0.06209835410118103, -0.08859474956989288, 0.0671553835272789, -0.006448917090892792, 0.0169842429459095, 0.031113164499402046, -0.07417412847280502, 0.05549546331167221, -0.013042094185948372, 0.0948401540517807, -0.07335975021123886, -0.03987044095993042, -0.005343804135918617, -0.08741248399019241, -0.08009110391139984, 0.005667346995323896, 0.03745159134268761, 0.019986214116215706, -0.03723142296075821, -0.0037649653386324644, 0.005682446528226137, 0.0659727230668068, -0.002658356446772814, 0.07049102336168289, -0.01944110542535782, -0.014278342947363853, 0.04189611226320267, 0.0312303826212883, -0.046760618686676025, 0.040438465774059296, 0.054074693471193314, 0.07479880005121231, -0.016405146569013596, 0.027125591412186623, -0.04216836765408516, 0.0011189498472958803, -0.01471384521573782, -0.010250975377857685, -0.006412460468709469, -0.12170380353927612, 0.015495882369577885, -0.054699406027793884, 0.05955614894628525, 0.06753991544246674, -0.03688138723373413, 0.049010518938302994, -0.07614680379629135, 0.06504888087511063, -0.014145595952868462, 0.02210555598139763, 0.023598313331604004, 0.00511248828843236, 0.013318972662091255, -0.11605404317378998, -0.032067783176898956, -0.05010659247636795, -0.023693162947893143, 0.06650379300117493, -0.026386691257357597, 0.06052805855870247, 0.0515507273375988, 0.033960308879613876, -0.06421340256929398, -0.09355985373258591, -0.0658700093626976, 0.10278744995594025, -0.10271084308624268, -0.012089421041309834, -0.04169749841094017, -0.07112454622983932, -0.032573599368333817, -0.0003141233173664659, 0.017007946968078613, 0.03622191399335861, 0.05829676240682602, 0.06261610984802246, 0.005667738616466522, 0.009631159715354443, 0.022852277383208275, 0.057013869285583496, -0.05015721917152405, 0.027599012479186058, -0.08637165278196335],
"VMASKMOVPDYmr":[-0.04878474771976471, 0.009688055142760277, 0.05428608879446983, -0.030850162729620934, 0.03008297272026539, 0.03831377625465393, -0.023454757407307625, 0.061062078922986984, -0.07177434861660004, 0.003681673901155591, 0.040161218494176865, -0.009652352891862392, 0.07261710613965988, -0.010966332629323006, -0.013221205212175846, -0.03301544487476349, 0.04829031974077225, -0.08083753287792206, 0.030231673270463943, -0.02659734897315502, -0.036777157336473465, 0.06681652367115021, 0.01175805926322937, 0.06305940449237823, -0.019296150654554367, 0.02796877548098564, -0.029999401420354843, -0.0198240764439106, -0.04471949115395546, -0.06781838089227676, 0.024380704388022423, 0.03754236921668053, 0.06767786294221878, 0.04803696274757385, 0.046649131923913956, 0.04538867995142937, -0.028629129752516747, 0.0127564687281847, 0.004995361436158419, -0.08728974312543869, 0.029057662934064865, 0.07067801058292389, 0.0007887053652666509, 0.019237162545323372, -0.04447153955698013, -0.10583364218473434, 0.08983936905860901, 0.015038984827697277, -0.034384895116090775, -0.055098336189985275, -0.07670909911394119, 0.002524072304368019, 0.10086455941200256, 0.022610867395997047, 0.05591642111539841, -0.07907918840646744, -0.04253252223134041, 0.05387851223349571, -0.034182146191596985, -0.08478306978940964, -0.039358172565698624, 0.05872701108455658, 0.0004980096709914505, -0.054017916321754456, -0.05543661117553711, -0.05234605073928833, -0.01648441143333912, -0.039598412811756134, 0.014009279198944569, 0.07753992825746536, -0.024791967123746872, 0.0015941763995215297, -0.08564147353172302, 0.015439499169588089, 0.04659571126103401, 0.042471837252378464, 0.005456998012959957, 0.015990061685442924, -0.02272135764360428, -0.03891618177294731, -0.0077924951910972595, -0.05113787576556206, 0.040118955075740814, -0.043831776827573776, 0.05283576622605324, 0.09104584157466888, 0.015506122261285782, -0.028880758211016655, -0.0025508899707347155, 0.08238258212804794, -0.011219828389585018, 0.0496247261762619, -0.044287387281656265, 0.050674524158239365, 0.02936738170683384, -0.017218898981809616, 0.07722929865121841, 0.04578819498419762, -0.031120644882321358, -0.022032534703612328],
"VMASKMOVPSYmr":[0.020578626543283463, -0.004085692577064037, 0.07696651667356491, 0.028803450986742973, -0.006955036427825689, -0.018540993332862854, 0.0719260424375534, 0.09322775900363922, 0.05095001682639122, -0.01811334490776062, 0.01627892442047596, 0.050088733434677124, -0.06736274808645248, 0.025077303871512413, 0.06022811681032181, -0.09305489808320999, -0.09338469058275223, -0.0525103323161602, -0.06159364432096481, 0.030921749770641327, 0.06632588058710098, 0.031169326975941658, 0.016549210995435715, -0.06410345435142517, 0.034944821149110794, 0.01632581278681755, 0.06805131584405899, -0.004622941836714745, -0.02994105964899063, 0.025459013879299164, 0.020487098023295403, 0.06677251309156418, -0.046148937195539474, -0.05847230181097984, -0.0662517175078392, -0.006552667822688818, 0.05338975414633751, -0.07456435263156891, -0.05682503432035446, -0.0720917284488678, -0.08354304730892181, 0.057539310306310654, -0.0984572172164917, -0.015717046335339546, -0.04905203357338905, -0.016580646857619286, 0.030063051730394363, -0.04245767742395401, -0.019089849665760994, 0.037014883011579514, -0.03125334531068802, -0.02194075658917427, 0.057924628257751465, 0.053156934678554535, 0.03154401481151581, -0.03698640316724777, -0.047283731400966644, -0.07787752151489258, -0.09294760227203369, 0.008879968896508217, -0.039479922503232956, 0.06407082825899124, 0.021868228912353516, 0.02621234394609928, -0.05872492864727974, -0.07943505048751831, 0.024682780727744102, 0.014713538810610771, 0.02206231839954853, -0.0664556622505188, -0.08985312283039093, -0.028045928105711937, 0.022865260019898415, -0.03564520925283432, 0.06292934715747833, 0.009946631267666817, 0.031550049781799316, -0.08577742427587509, 0.047102898359298706, -0.07018786668777466, -0.10670997202396393, 0.0016501775244250894, -0.08505392074584961, 0.00861909706145525, -0.06370823830366135, 0.03423767164349556, 0.03173772618174553, 0.019602738320827484, 0.021573755890130997, 0.02385428547859192, -0.01468846295028925, 0.023825718089938164, -0.05538937821984291, 0.05968264490365982, 0.08997872471809387, -0.006320557557046413, 0.012793052941560745, -0.10326020419597626, -0.015349009074270725, 0.006139614153653383],
"VMASKMOVPSYrm":[-0.09999474138021469, -0.05461611971259117, 0.06111544370651245, 0.009340068325400352, 0.05158305540680885, 0.018409717828035355, 0.03258055821061134, -0.0017857305938377976, 0.041260261088609695, -0.04183795303106308, -0.04711655154824257, 0.007005605846643448, 0.017177876085042953, 0.011972760781645775, -0.058734532445669174, 0.022736912593245506, -0.10794606059789658, 0.029367392882704735, -0.012645614333450794, -0.09590506553649902, -0.07090207934379578, -0.05850019305944443, -0.018024247139692307, -0.0036007456947118044, -0.06459654122591019, 0.009839186444878578, 0.04846305027604103, -0.11106285452842712, 0.029033005237579346, 0.10009876638650894, 0.012796668335795403, -0.0073439814150333405, -0.08754748106002808, -0.037603843957185745, -0.015900349244475365, -0.007158457301557064, 0.03420218825340271, 0.027995899319648743, -0.07699259370565414, 0.042778756469488144, 0.04648644104599953, -0.04391217231750488, -0.018405593931674957, -0.01280362717807293, 0.08530068397521973, -0.03674551844596863, 0.06248623505234718, 0.0038591010961681604, -0.07031620293855667, -0.01702764257788658, 0.005379523150622845, -0.029414091259241104, 0.00011999297566944733, 0.058016858994960785, 0.10091454535722733, 0.07112561911344528, -0.07445680350065231, -0.08252609521150589, -0.05458306148648262, 0.0828995481133461, 0.030287114903330803, 0.08512170612812042, -0.0745752677321434, 0.011145705357193947, 0.07730960845947266, 0.06756677478551865, 0.10192125290632248, -0.015338120050728321, 0.025173967704176903, -0.017697714269161224, 0.00455897580832243, -0.01002852339297533, -0.09001599997282028, 0.06024448946118355, 0.01357717253267765, -0.04349803552031517, 0.026919689029455185, 0.07871785014867783, 0.06163106486201286, -0.02904645912349224, 0.05042176693677902, 0.019180594012141228, -0.029065869748592377, -0.02645217627286911, -0.04180121049284935, -0.01644887775182724, 0.005278781522065401, 0.021325504407286644, 0.0710480809211731, -0.02405066229403019, 0.06883849203586578, -0.08493685722351074, -0.0180019810795784, 0.10276532918214798, -0.04697193205356598, -0.0004998040967620909, 0.014400942251086235, 0.07172509282827377, 0.027445673942565918, 0.04722077399492264],
@@ -673,5 +642,42 @@
"V_SETALLONES":[0.011805560439825058, 0.005605545360594988, 0.019577916711568832, -0.007038246374577284, -0.013101942837238312, -0.060087915509939194, 0.06600171327590942, 0.1127510741353035, 0.03251935541629791, -0.08513955771923065, -0.1272188425064087, -0.05743984133005142, 0.03415455296635628, -0.01813715696334839, 0.08123213797807693, -0.02604430541396141, 0.004977638833224773, -0.05056260898709297, 0.0759609192609787, -0.03905864432454109, -0.029284782707691193, -0.0773778036236763, -0.06391929090023041, 0.03013690747320652, 0.025567403063178062, -0.04096659645438194, -0.013911372050642967, 0.03076753579080105, 0.09287972748279572, 0.06516721844673157, 0.013303481042385101, -0.05148301273584366, 0.013247961178421974, -0.02087739109992981, -0.06532798707485199, -0.07080436497926712, 0.03797996789216995, -0.05954182893037796, -0.006158157251775265, -0.039611611515283585, 0.016250262036919594, -0.009441757574677467, -0.009183786809444427, 0.16159473359584808, 0.08712765574455261, -0.022884182631969452, -0.03575573116540909, -0.03199240192770958, -0.03306444734334946, -0.003918874077498913, 0.062194518744945526, 0.015179269947111607, -0.027334710583090782, -0.058873455971479416, 0.128275528550148, -0.0292880367487669, -0.07747887820005417, 0.1131230816245079, 0.02434738725423813, -0.025987306609749794, 0.006977062206715345, 0.005061171483248472, 0.010551988147199154, -0.011694980785250664, -0.04222672060132027, 0.0018857514951378107, -0.09771532565355301, 0.005980918649584055, -0.021874738857150078, -0.03269551321864128, -0.0660959854722023, -0.03511122986674309, -0.012204808183014393, -0.010394910350441933, 0.05620425567030907, -0.07928325980901718, 0.0231300238519907, -0.018796175718307495, -0.059483520686626434, -0.06498315185308456, -0.002720780670642853, 0.017449399456381798, -0.07902888208627701, -0.09885134547948837, 0.013462111353874207, 0.0991656631231308, 0.03312922269105911, -0.006249894388020039, 0.005173753947019577, -0.06332565099000931, -0.06398826092481613, -0.03855561092495918, 0.049685269594192505, 0.016197331249713898, -0.006844596937298775, -0.05894636735320091, 0.026065604761242867, -0.023921040818095207, 0.0833858922123909, 0.04180749133229256],
"XCHG":[0.03013892099261284, -0.005918541457504034, -0.003877029987052083, -0.01153622567653656, 0.07044235616922379, 0.0020885420963168144, -0.04268760234117508, 0.07963797450065613, 0.0896378755569458, -0.03346250206232071, -0.026062551885843277, 0.07721738517284393, 0.08893758058547974, 0.0798523873090744, -0.025333784520626068, -0.01930663175880909, -0.012997916899621487, -0.051225848495960236, -0.0299966000020504, -0.032841041684150696, -0.06343690305948257, -0.016547048464417458, 0.034530773758888245, 0.057199425995349884, 0.0693645030260086, 0.04208416864275932, -0.028830133378505707, 0.08431533724069595, -0.06464798003435135, 0.0009512414690107107, 0.042868468910455704, -0.031348757445812225, -0.01816270686686039, 0.05597987025976181, -0.017707090824842453, -0.03889893740415573, -0.052769940346479416, 0.012921033427119255, -0.029488561674952507, -0.012502696365118027, 0.05398940294981003, -0.032147347927093506, -0.005250571761280298, -0.014250441454350948, 0.08205590397119522, 0.049281857907772064, -0.07257362455129623, -0.0003973407146986574, -0.00821124017238617, 0.10007432103157043, 0.054469816386699677, -0.05644146353006363, 0.013105852529406548, -0.08262810856103897, -0.02594495750963688, 0.007682343479245901, -0.011262120679020882, -0.007376475725322962, -0.05011703073978424, -0.06952987611293793, -0.033738043159246445, 0.01750120520591736, -0.026767224073410034, -0.04718783125281334, 0.002559647196903825, 0.01700885407626629, -0.07193762063980103, 0.07015261799097061, 0.0034866048954427242, -0.08257746696472168, -0.07703307271003723, 0.006709580775350332, 0.06423933804035187, 0.024792056530714035, -0.008637255057692528, 0.0364011712372303, 0.035330090671777725, -0.060980167239904404, 0.026977067813277245, -0.02813805267214775, -0.02690977416932583, 0.05637027323246002, 0.008040377870202065, -0.03371180593967438, -0.06654872000217438, -0.030922764912247658, -0.07050447911024094, 0.047597192227840424, 0.047301240265369415, 0.04565070942044258, -0.0005885852151550353, -0.01970672234892845, -0.013277091085910797, 0.03462797775864601, -0.050644565373659134, -6.830461643403396e-05, -0.0032834408339112997, -0.09096988290548325, -0.0431605726480484, 0.004180085379630327],
"XOR":[0.05397406592965126, 0.030059566721320152, -0.008174624294042587, -0.015902524814009666, -0.05867229402065277, 0.10023067146539688, 0.039013586938381195, -0.0062194764614105225, 0.0027951474767178297, -0.12871405482292175, 0.006182669661939144, -0.03362947702407837, 0.03972288593649864, -0.0761077031493187, 0.07198456674814224, 0.06330277770757675, -0.020690103992819786, 0.04084693267941475, -0.029953323304653168, -0.1037738174200058, 0.058683767914772034, -0.09326515346765518, -0.030509043484926224, 0.08620086312294006, -0.028335779905319214, 0.0025649559684097767, 0.02293877862393856, 0.06309233605861664, 0.05537085980176926, 0.008650199510157108, 0.08450134843587875, 0.006163342390209436, 0.08676894754171371, 0.00373055599629879, -0.0536164715886116, 0.017478466033935547, -0.02005663886666298, -0.009954672306776047, 0.0935724526643753, -0.013202485628426075, 0.019175032153725624, 0.047811202704906464, -0.010279017500579357, 0.08613553643226624, 0.030951783061027527, -0.007498149760067463, 0.02222890406847, 0.022576699033379555, -0.037464242428541183, -0.05039561539888382, -0.05145428702235222, 0.05291113257408142, -0.04549814388155937, 0.07552238553762436, 0.04320567473769188, 0.08343681693077087, -0.03850278630852699, -0.01834949105978012, 0.047886237502098083, 0.00965320598334074, 0.014898041263222694, -0.06947735697031021, -0.002480468712747097, 0.033667247742414474, -0.057668499648571014, 0.038462892174720764, -0.04644528403878212, -0.06664751470088959, -0.048734813928604126, 0.04303475841879845, 0.027636554092168808, 0.024116700515151024, -0.003788548056036234, -0.0088395019993186, -0.04236738011240959, -0.02894027903676033, -0.135579451918602, -0.032144784927368164, -0.11316774785518646, -0.0039872839115560055, 0.07162772864103317, 0.03945969045162201, 0.007661669049412012, 0.04564569517970085, 0.023007070645689964, 0.0002026051515713334, -0.030437719076871872, -0.01982058770954609, -0.017619898542761803, -0.04013601690530777, 0.03464880958199501, -0.04437020793557167, 0.010373799130320549, -0.057255037128925323, -0.006371108815073967, -0.02713695913553238, -0.06605585664510727, 0.01780680939555168, -0.00013575045159086585, 0.07283638417720795]
+ },
+ "CommonOperands" : {
+ "Immediate":[-0.039664868265390396, 0.028720445930957794, -0.057207897305488586, 0.04179477319121361, 0.04477043077349663, 0.020050648599863052, -0.056656818836927414, -0.025030966848134995, -0.04394019395112991, 0.04849115386605263, 0.012325904332101345, 0.06731707602739334, 0.04568001255393028, -0.04773757979273796, -0.012142524123191833, -0.03986259177327156, -0.027249159291386604, -0.04930245876312256, -0.10542229562997818, -0.05678592994809151, -0.038303568959236145, -0.07283245027065277, 0.0217409897595644, -0.01139344647526741, 0.006936497986316681, -0.04702157527208328, 0.09977010637521744, -0.035237088799476624, 0.028822069987654686, -0.0691431537270546, -0.0829710066318512, -0.1289154589176178, -0.08470306545495987, -0.06731563061475754, 0.06642980873584747, 0.026025734841823578, -0.04049745202064514, 0.030080674216151237, 0.04203929752111435, 0.06834205985069275, 0.04315062239766121, 0.00788890291005373, 0.03426999971270561, 0.08819636702537537, 0.004112098831683397, 0.03392210975289345, 0.010541473515331745, 0.08045777678489685, -0.02914009988307953, 0.0624285452067852, 0.03299122676253319, -0.05355033650994301, -0.07568570226430893, 0.08106201142072678, 0.0376802459359169, -0.04886564612388611, -0.10992937535047531, -0.00761816743761301, -0.014918084256350994, 0.03816765174269676, -0.04981819912791252, 0.00031993765151128173, 0.011382698081433773, -0.029902901500463486, -0.0117422454059124, -0.057965945452451706, -0.09519924223423004, 0.020727403461933136, -0.04526710882782936, 0.09883677959442139, 0.018033087253570557, -0.003035350237041712, -0.06968960911035538, -0.09893210977315903, -0.01264366414397955, 0.017397744581103325, -0.08519260585308075, 0.09382850676774979, -0.055508699268102646, -0.026548130437731743, -0.013868317008018494, -0.03162496164441109, 0.06089535728096962, -0.01583624631166458, -0.060260944068431854, 0.06709896773099899, -0.09333796799182892, -0.02887417934834957, -0.03424007445573807, -0.01687423326075077, 0.11968979239463806, -0.08361987769603729, 0.09037765115499496, -0.04322688281536102, -0.040831610560417175, -0.061376459896564484, -0.03485504537820816, 0.016033072024583817, 0.004106835462152958, -0.03354674205183983],
+ "MBB":[0.0285621527582407, 0.017540860921144485, -0.08473232388496399, -0.004012782592326403, 0.01284435298293829, -0.05268647149205208, 0.05576688051223755, 0.0021535248961299658, -0.03945871442556381, -0.006189210340380669, -0.015129411593079567, -0.08998296409845352, -0.023543253540992737, -0.03973307088017464, 0.03474939242005348, -0.01602775789797306, -0.07461361587047577, -0.016514597460627556, -0.016366377472877502, 0.004728052299469709, -0.023341577500104904, -0.0914730429649353, 0.030636735260486603, -0.03425632417201996, 0.03614623472094536, -0.007019295822829008, -0.0218521635979414, -0.015808485448360443, -0.05414801836013794, 0.029721688479185104, 0.09407073259353638, 0.029655681923031807, -0.005722714588046074, 0.08653672784566879, 0.01633341796696186, -0.07890991121530533, -0.07574641704559326, 0.013483843766152859, -0.0011275253491476178, -0.05623066797852516, -0.03096684440970421, -0.0019136210903525352, 0.005127475131303072, 0.005057196598500013, -0.008401975966989994, -0.0391613207757473, -0.0026145142037421465, 0.05342942103743553, 0.034099776297807693, 0.028928104788064957, -0.006105952430516481, -0.039190810173749924, 0.026784662157297134, -0.07679374516010284, -0.007475676946341991, -0.036650288850069046, 0.00774755235761404, 0.008984091691672802, -0.059830714017152786, 0.042310964316129684, 0.0681624785065651, -0.018189340829849243, -0.014816401526331902, -0.05541539564728737, -0.09348370134830475, 0.003691869555041194, -0.0010735570685938, -0.010131723247468472, -0.041050590574741364, -0.013792471028864384, -0.024337435141205788, 0.07526508718729019, 0.08163300901651382, -0.03508464992046356, -0.01681988686323166, -0.06734774261713028, -0.07656992971897125, -0.03866373747587204, 0.004544078838080168, 0.0585801787674427, -0.021823249757289886, -0.0610244981944561, -0.04469957575201988, -0.011089849285781384, -0.05069964751601219, -0.025694409385323524, -0.0670132040977478, 0.09616350382566452, 0.06308142840862274, -0.10543308407068253, 0.0023751568514853716, -0.06237253174185753, 0.05771911144256592, -0.06010056659579277, -0.016188565641641617, 0.009142348542809486, -0.014255198650062084, -0.02999819628894329, 0.00473234336823225, 0.03976761922240257],
+ "FrameIndex":[0.05219179764389992, -0.01926516741514206, -0.021848104894161224, -0.008528115227818489, 0.02989117242395878, -0.012461756356060505, -0.050973404198884964, 0.026713935658335686, 0.01968700997531414, -0.001058116089552641, 0.009182002395391464, 0.03877940773963928, 0.070717453956604, -0.0028735792730003595, 0.0528000183403492, -0.015265910886228085, 0.007753959856927395, 0.01596899703145027, -0.07933179289102554, -0.02578687109053135, 0.02417992427945137, -0.03462255373597145, 0.04385964199900627, 0.004388607107102871, 0.03716951236128807, 0.04064105078577995, 0.07711678743362427, 0.0068300217390060425, -0.05443308874964714, -0.010809220373630524, -0.03124961629509926, 0.004911563824862242, -0.09201066941022873, 0.051436200737953186, 0.015400445088744164, 0.07804328948259354, -0.02971532940864563, -0.0003241244703531265, -0.02131350338459015, -0.09173687547445297, -0.01707594096660614, 0.0025449323002249002, 0.08701702952384949, 0.10675988346338272, -0.05082142353057861, 0.021581847220659256, -0.04104776680469513, 0.08402986079454422, -0.06109907105565071, 0.015201682224869728, 0.04374992102384567, -0.028573378920555115, -0.07767742872238159, 0.07216905802488327, 0.020538095384836197, -0.01229778677225113, 0.003033912740647793, -0.0007747758063487709, -0.09185474365949631, -0.02851664461195469, -0.009441743604838848, 0.05500328913331032, -0.002983751241117716, -0.09198789298534393, -0.051319632679224014, -0.054626885801553726, -0.020108554512262344, 0.0010591084137558937, -0.009138713590800762, 0.07223176956176758, -0.022099260240793228, 0.016025206074118614, -0.05320229008793831, 0.025131219998002052, 0.06626036763191223, 0.07639450579881668, -0.027084894478321075, 0.06581225991249084, -0.017618829384446144, -0.03859466314315796, -0.03385398909449577, 0.018783841282129288, -0.0730312392115593, 0.06957981735467911, -0.03065340407192707, 0.020685074850916862, -0.05311165004968643, 0.09466810524463654, 0.00955914705991745, -0.013919183053076267, -0.05540250986814499, -0.03087283857166767, -0.009688221849501133, 0.016239993274211884, -0.012926830910146236, -0.027712060138583183, -0.06342892348766327, -0.011996395885944366, 0.05536693334579468, -0.04359230771660805],
+ "ConstantPoolIndex":[0.041396364569664, -0.032536957412958145, -0.01450332161039114, -0.006678386591374874, 0.058945223689079285, 0.02544882893562317, -0.03047209233045578, -0.07739393413066864, -0.09328317642211914, -0.01668739691376686, -0.024649402126669884, -0.0379607230424881, -0.11910244077444077, -0.020992999896407127, -0.007654233835637569, -0.005232746247202158, -0.05641235038638115, -0.030478237196803093, -0.11095637828111649, -0.029757868498563766, 0.007831704802811146, -0.06478779017925262, -0.029330771416425705, -0.016729608178138733, 0.016851121559739113, -0.08636923134326935, 0.09819734841585159, -0.06862954050302505, -0.054081980139017105, -0.11573795974254608, 0.025045182555913925, -0.045820001512765884, -0.03937136381864548, -0.0006095073185861111, 0.010480350814759731, 0.04263518005609512, -0.07309181243181229, 0.030367357656359673, 0.05174611508846283, -0.07616177201271057, 0.08458246290683746, -0.05704038590192795, -0.08539492636919022, -0.027642514556646347, -0.01617196388542652, 0.025178344920277596, 0.009598441421985626, -0.02391812391579151, -0.007018273696303368, 0.08220435678958893, 0.019317878410220146, -0.07800780981779099, 0.008812256157398224, -0.08796992152929306, -0.018406951799988747, 0.06285018473863602, 0.0247958917170763, -0.010797450318932533, 0.042904313653707504, 0.04307369515299797, 0.03591239079833031, 0.0318138487637043, -0.052741825580596924, -0.05960077419877052, 0.05289359390735626, -0.07335714250802994, -0.07966916263103485, 0.06509458273649216, -0.014078558422625065, 0.05966315418481827, -0.10191051661968231, 0.038503143936395645, 0.08414285629987717, -0.09167703986167908, -0.03125883638858795, 0.00029595239902846515, -0.05052953213453293, 0.06109768897294998, 0.027757229283452034, 0.07064288854598999, 0.025423981249332428, 0.04430470988154411, 0.006646708585321903, 0.011614424176514149, -0.058028463274240494, -0.026873555034399033, -0.045714568346738815, -0.009242760017514229, -0.08255617320537567, 0.03060135245323181, -0.019932182505726814, -0.07189206779003143, 0.01935136877000332, 0.05297813192009926, 0.004497232846915722, -0.08383949100971222, -0.0008196682319976389, 0.03524069860577583, 0.023135961964726448, 0.00863903108984232],
+ "JumpTableIndex":[-0.007416237145662308, 0.0038157713133841753, 0.05180662125349045, 0.03776901960372925, -0.011749244295060635, -0.02952706068754196, -0.06646136939525604, 0.02088487148284912, -0.001927916775457561, 0.018895410001277924, 0.0509350448846817, 0.057210080325603485, -0.0476078987121582, -0.00016809302906040102, -0.02341553010046482, -0.06734820455312729, 0.02047930844128132, 0.009282611310482025, 0.0038133300840854645, 0.0020261742174625397, -0.09253961592912674, 0.0766557827591896, -0.049570225179195404, -0.11510220915079117, -0.009570423513650894, -0.007274465169757605, 0.07750000059604645, 0.02489926479756832, -0.08297400176525116, 0.048176445066928864, 0.03797437995672226, 0.060842450708150864, 0.020265065133571625, -0.03559373319149017, 0.03493893891572952, -0.0036544676404446363, 0.010211148299276829, -0.06471849977970123, -0.034595828503370285, -0.05245388671755791, -0.0014119939878582954, 0.008752748370170593, -0.020637203007936478, 0.053244929760694504, 0.052053239196538925, 0.014706660993397236, 0.02803724631667137, -0.07983336597681046, 0.03106858767569065, 0.001688914722763002, -0.07647732645273209, -0.028148295357823372, -0.0528123639523983, 0.08006428182125092, -0.06398879736661911, -0.033476538956165314, 0.05217607319355011, -0.03093232959508896, 0.044230975210666656, 0.05123162269592285, -0.05225585401058197, 0.06976816058158875, -0.0014492797199636698, 0.03833283483982086, 0.08385992050170898, -0.04722217097878456, -0.00226160092279315, -0.027254855260252953, -0.09566919505596161, 0.02109321765601635, -0.032354824244976044, 0.08032239973545074, -0.046937450766563416, -0.004326784983277321, -0.026024870574474335, 0.12039119750261307, 0.1016048863530159, 0.06808122247457504, -0.012297546491026878, -0.06450799852609634, 0.015778351575136185, 0.012280710972845554, 0.04002666845917702, 0.04792468994855881, -0.06248988211154938, -0.054222140461206436, 0.018379682675004005, -0.0029111658222973347, 0.016062958166003227, 0.09880068898200989, 0.03846307471394539, 0.04975416138768196, 0.07305088639259338, -0.020941948518157005, -0.020897891372442245, 0.03872328996658325, -0.05682756006717682, 0.09583723545074463, 0.0028475294820964336, -0.05127262324094772],
+ "ExternalSymbol":[0.014755810610949993, -0.049842361360788345, -0.06733497977256775, 0.05401315540075302, 0.061938412487506866, 0.02437831088900566, -0.06823863834142685, 0.03685877099633217, 0.02961423434317112, -0.04944299906492233, -0.1271103173494339, 0.030452819541096687, 0.019848955795168877, -0.03185190260410309, 0.06586895883083344, 0.0007315169204957783, 0.010839227586984634, -0.09547370672225952, -0.01799146644771099, -0.02204788289964199, 0.048699937760829926, 0.004187166225165129, 0.004053634125739336, -0.04464051127433777, -0.005158414598554373, -0.0416896678507328, -0.024279240518808365, -0.05358913540840149, -0.04719633609056473, -0.07180647552013397, 0.02559211477637291, 0.04657098650932312, 0.08353757858276367, -0.0023563469294458628, 0.046847302466630936, -0.03508693352341652, 0.0696689784526825, 0.054716791957616806, -0.012037037871778011, 0.019885245710611343, 0.01824580691754818, -0.06719563156366348, -0.05447190999984741, 0.08877509087324142, -0.01375679112970829, -0.014463561587035656, -0.049798283725976944, 0.06304343044757843, -0.007584648672491312, -0.016156170517206192, 0.024602508172392845, 0.004940119571983814, -0.04088609293103218, 0.0026271860115230083, 0.00787595845758915, -0.01889132149517536, -0.041029710322618484, 0.07343143969774246, -0.02505693957209587, -0.04825644940137863, 0.060728199779987335, 0.00460366066545248, 0.020744791254401207, 0.04238201677799225, -0.024090539664030075, -0.05792662873864174, 0.07639332860708237, -0.07511764764785767, -0.08259762078523636, 0.07901840656995773, -0.000285966758383438, 0.021390466019511223, -0.07818973809480667, -0.02385067008435726, -0.0014113716315478086, -0.055170729756355286, 0.00946732610464096, 0.02471417747437954, 0.07941421121358871, 0.006746167317032814, -0.06766024231910706, -0.089698426425457, 0.01933225803077221, -0.06994582712650299, -0.10149082541465759, 0.06007266044616699, -0.14545120298862457, -0.03447172790765762, 0.03258124738931656, 0.04966919496655464, 0.023691890761256218, -0.014501980505883694, 0.05896589905023575, 0.04760534316301346, -0.017742110416293144, 0.0019451226107776165, -0.01854461058974266, -0.04744676500558853, -0.017504630610346794, 0.05197983980178833],
+ "GlobalAddress":[0.021709734573960304, -0.03253590315580368, -0.04603651538491249, -0.02350226789712906, 0.02841794677078724, 0.01920732669532299, 0.053104616701602936, 0.03941836208105087, -0.01895466446876526, -0.030471740290522575, 0.010719750076532364, 0.020050356164574623, 0.03648754581809044, -0.021573888137936592, -0.02554452419281006, -3.637039117165841e-05, 0.05989491194486618, -0.006903402041643858, -0.08826262503862381, -0.028047384694218636, -0.04230065643787384, -0.05190899223089218, 0.06145390123128891, 0.0005839569494128227, -4.391977927298285e-05, -0.01880771853029728, 0.09660127758979797, 0.04333353415131569, 0.06461602449417114, -0.06010710820555687, -0.0690189078450203, 0.04574553668498993, -0.07640431076288223, 0.01879746839404106, 0.02076675370335579, 0.04869573190808296, 0.025147439911961555, 0.05311164632439613, 0.05711919441819191, 0.049520380795001984, 0.041169121861457825, -0.0603964701294899, -0.04195070639252663, 0.07676130533218384, -0.015161959454417229, 0.02903268299996853, -0.027548301964998245, 0.04705912992358208, -0.11194053292274475, -0.008245207369327545, -0.07792827486991882, -0.019468743354082108, 0.05482499673962593, -0.0028855702839791775, 0.05478052794933319, 0.07484771311283112, -0.011742575094103813, 0.00923923309892416, -0.05074375122785568, 0.06956734508275986, -0.045990440994501114, 0.007280972320586443, 0.040920473635196686, -0.09143709391355515, -0.06105270981788635, -0.0021254979074001312, -0.09519167989492416, 0.06324268877506256, -0.0693386048078537, -0.05100148543715477, 0.010643817484378815, -0.008162467740476131, -0.08811189234256744, -0.08640385419130325, 0.0077143507078289986, 0.030832089483737946, -0.01504515577107668, 0.07277517020702362, 0.02581198327243328, -0.052599068731069565, -0.06478387117385864, 0.01634707674384117, -0.021173706278204918, 0.030482977628707886, -0.09826494008302689, 0.07716016471385956, -0.10845024883747101, 0.04479274898767471, -0.015128640457987785, -0.03491876646876335, 0.05239150673151016, -0.03427724912762642, 0.06768845021724701, -0.04174086079001427, -0.05136744678020477, 0.0037109211552888155, -0.030324269086122513, -0.06928850710391998, -0.0395960658788681, 0.07726000994443893],
+ "RegisterMask":[0.009287647902965546, 0.029691029340028763, -0.03465871885418892, 0.032606374472379684, -0.007339544594287872, 0.03367740660905838, -0.0661492720246315, 0.0436118021607399, -0.002896533813327551, 0.028440887108445168, -0.06791415065526962, 0.004055356606841087, -0.01596181094646454, -0.003846745239570737, 0.06762582808732986, -0.025632556527853012, 0.08132420480251312, 0.025554664433002472, -0.08994632959365845, 0.02521730400621891, 0.023826507851481438, 0.0004487193073146045, 0.01047397032380104, 0.03246957063674927, -0.033482909202575684, 0.05051224306225777, 0.005778896156698465, -0.0006257061613723636, 0.00522293895483017, -0.04666636884212494, 0.022335125133395195, -0.022150320932269096, 0.04510439187288284, -0.02769547514617443, 0.026804683730006218, 0.0710473507642746, -0.014513042755424976, 0.0695318952202797, 0.048469461500644684, -0.008654370903968811, -0.028613079339265823, -0.02918054349720478, -0.022721733897924423, -0.0004791628452949226, 0.011470172554254532, 0.08561886101961136, 0.07125027477741241, -0.05847848951816559, 0.011811288073658943, -0.025244031101465225, -0.03665035218000412, -0.03482883796095848, 0.04196881502866745, 0.06909161061048508, 0.02365143597126007, -0.0689089447259903, -0.0707414448261261, -0.03962424397468567, -0.025703679770231247, 0.06502455472946167, 0.057676125317811966, 0.026916807517409325, 0.024921152740716934, 0.009799988009035587, -0.018656229600310326, 0.009880480356514454, -0.06516153365373611, 0.019290866330266, 0.02236226759850979, -0.02598695270717144, -0.00299705658107996, 0.019448822364211082, -0.014883329160511494, 0.06645222008228302, -0.028751512989401817, -0.01589173451066017, 0.026225939393043518, 0.07285763323307037, -0.06037987396121025, -0.027615630999207497, -0.039930179715156555, -0.07122864574193954, 0.029825787991285324, 0.026364129036664963, -0.04438399150967598, 0.07015394419431686, -0.013950555585324764, 0.004367176443338394, 0.020521124824881554, 0.02030497044324875, 0.011951270513236523, 0.06765977293252945, -0.015042259357869625, 0.005189584568142891, -0.07532864063978195, -0.010886142030358315, 0.006792030762881041, -0.06348442286252975, 0.031859394162893295, -0.052482619881629944],
+ "Metadata":[-0.07879140228033066, 0.024690961465239525, 0.022790303453803062, 0.01354144886136055, -0.07098772376775742, 0.04053819552063942, -0.04038544371724129, -0.021055836230516434, 0.10361373424530029, 0.04415135458111763, -0.09545262902975082, 0.042553599923849106, -0.021835647523403168, 0.07703430950641632, -0.04880501329898834, -0.04054124280810356, 0.05049756169319153, 0.08986796438694, 0.0705084353685379, -0.0077315340749919415, -0.045390889048576355, 0.053155045956373215, 0.045656319707632065, -0.02663712576031685, -0.01446426473557949, -0.058978915214538574, 0.011314704082906246, 0.03043927252292633, -0.0843580812215805, 0.017854437232017517, -0.08720997720956802, 0.030351335182785988, -0.04896129295229912, 0.04189978539943695, -0.09887325763702393, 0.0015409664483740926, -0.08604399859905243, 0.10654544085264206, 0.1058540865778923, 0.014106648042798042, 0.0640459656715393, -0.05182884633541107, 0.006081609521061182, 0.07624028623104095, 0.02025698497891426, 0.08467324078083038, 0.027136018499732018, 0.026320911943912506, -0.035337720066308975, 0.03864980861544609, -0.019960917532444, -0.029152821749448776, 0.06562864780426025, 0.028298277407884598, -0.07397148013114929, -0.005078969523310661, 0.025909438729286194, -0.01157586183398962, 0.05436081811785698, 0.03408071771264076, -0.07142144441604614, -0.0523630827665329, -0.06302442401647568, -0.019975490868091583, -0.06937523931264877, 0.057667043060064316, -0.08580337464809418, -0.05092239752411842, -0.012613813392817974, 0.025480754673480988, 0.04219530522823334, -0.007300581783056259, 0.05323299020528793, 0.0489904023706913, 0.09260626882314682, -0.04819458723068237, 0.05419271066784859, 0.04558999091386795, 0.012036344967782497, -0.05483977124094963, -0.05181310698390007, -0.02104383148252964, -0.057876624166965485, 0.039601441472768784, 0.025240536779165268, -0.03984035924077034, 0.07654847204685211, -0.07073183357715607, -0.0018080074805766344, -0.016453349962830544, 0.03962434455752373, 0.05717255175113678, 0.01962372660636902, 0.00952839944511652, 0.0013127806596457958, 0.013634574599564075, 0.07692103832960129, 0.06334574520587921, 0.056647684425115585, -0.02965259924530983],
+ "MCSymbol":[0.05158298835158348, 0.05024643987417221, 0.06704410910606384, 0.0378347709774971, -0.03902719169855118, -0.08626251667737961, 0.03964311257004738, 0.06615762412548065, 0.04361319541931152, 0.03646374121308327, -0.018487416207790375, 0.0024993624538183212, 0.006693041883409023, 0.08311881870031357, 0.021111667156219482, 0.038208797574043274, 0.08689694851636887, -0.03659898787736893, 0.020775076001882553, 0.03553535416722298, 0.06854367256164551, -0.002012243028730154, 0.03658154606819153, 0.03127564862370491, 0.0363621786236763, -0.027205800637602806, -0.05243372917175293, 0.012564878910779953, -0.013430594466626644, -0.04043225944042206, -0.025083716958761215, 0.09665156900882721, 0.005077417939901352, -0.05181048810482025, 0.08925056457519531, 0.0777667909860611, -0.013708796352148056, 0.07754126191139221, 0.08393577486276627, 0.06395212560892105, -0.07428556680679321, -0.052424050867557526, 0.03497577831149101, 0.01964585855603218, -0.0429445318877697, 0.07072066515684128, 0.0017074055504053831, 0.059513408690690994, 0.013262910768389702, -0.07240563631057739, 0.09288764744997025, 0.030620144680142403, -0.046197980642318726, 0.04847298562526703, -0.03942957893013954, -0.0025783153250813484, -0.019526517018675804, 0.038867682218551636, 0.006007499527186155, -0.06366054713726044, 0.004640159662812948, 0.013837787322700024, -0.020015377551317215, -0.010317903012037277, 0.001741019543260336, 0.06261103600263596, -0.03374830260872841, 0.01629183441400528, -0.013137640431523323, 0.026046304032206535, -0.009679407812654972, -0.07085473090410233, 0.03035539574921131, -0.08764562010765076, -0.03820766881108284, -0.04181021824479103, -0.05163294076919556, 0.06666433811187744, -0.08939782530069351, 0.040260378271341324, -0.06847432255744934, 0.09106951206922531, -0.07388591021299362, -0.07479099184274673, -0.001779694459401071, -0.0963745042681694, -0.06515862792730331, -0.08404017239809036, -0.09935544431209564, 0.010541093535721302, -0.04491754248738289, 0.09378639608621597, 0.006655062548816204, 0.06637217849493027, -0.05623293295502663, -0.020134123042225838, 0.005873391404747963, -0.07765494287014008, -0.0008442706312052906, -0.03568055108189583]
+ },
+ "VirtualRegisters" : {
+ "VIRT_REG_FR32":[0.0034248235169798136, -0.011980761773884296, -0.0501178540289402, 0.0494888611137867, 0.06103336811065674, -0.06178610771894455, 0.007709897588938475, -0.011392943561077118, 0.06570645421743393, 0.0771368145942688, 0.0005577280535362661, 0.013396150432527065, -0.041660163551568985, 0.05122360959649086, 0.11354377865791321, -0.009875510819256306, -0.06466709822416306, 0.048170577734708786, 0.0007201629341579974, 0.06538223475217819, 0.08870227634906769, -0.05771782249212265, 0.009273379109799862, -0.03325295075774193, 0.01197165809571743, 0.06604835391044617, 0.08265330642461777, -0.005758166313171387, 0.02512396313250065, 0.03383670747280121, 0.038484204560518265, -0.06539343297481537, -0.013461028225719929, 0.001498897559940815, 0.05170154944062233, 0.06965786963701248, -0.07339458167552948, 0.05094756931066513, 0.01983451284468174, -0.06855696439743042, 0.07892709225416183, 0.06099703162908554, 0.08492864668369293, 0.05357863008975983, -0.009294840507209301, -0.0054923719726502895, -0.029938997700810432, 0.028260599821805954, 0.053790509700775146, -0.06574371457099915, -0.009621666744351387, -0.08131514489650726, -0.08474338054656982, 0.039622966200113297, 0.06945627927780151, 0.02545306645333767, 0.005390701815485954, 0.04582791030406952, -0.1103447750210762, -0.050917647778987885, 0.03087870217859745, 0.06918162852525711, 0.0548822283744812, -0.01838473603129387, 0.05597897991538048, 0.03548860549926758, -0.009931124746799469, -0.07856663316488266, 0.033994875848293304, 0.03467561677098274, 0.09580692648887634, -0.04153195023536682, -0.06732118874788284, -0.06857144832611084, 0.03419093042612076, -0.01200241968035698, -0.06983492523431778, 0.05929506942629814, -0.00041734304977580905, -0.026396293193101883, 0.05230500176548958, -0.006162640172988176, 0.044198282063007355, -0.028765834867954254, 0.031155114993453026, 0.06967037916183472, -0.0892564132809639, 0.028816571459174156, -0.037065472453832626, 0.06540130823850632, -0.01888667233288288, 0.030632384121418, 0.0359313078224659, 0.106044240295887, 0.03259910270571709, -0.0775517001748085, -0.04267778620123863, 0.04977935180068016, -0.01790289767086506, -0.11223265528678894],
+ "VIRT_REG_FR64":[0.08496882021427155, 0.049308884888887405, -0.016840212047100067, 0.010602951049804688, -4.6025739720789716e-05, -0.06524767726659775, 0.048670798540115356, -0.06444543600082397, -0.0031944462098181248, 0.05608433857560158, -0.03958145156502724, 0.05171080678701401, -0.03572545200586319, -0.054364755749702454, 0.052311528474092484, -0.0361458919942379, 0.024109655991196632, 0.15923210978507996, -0.07255382835865021, -0.011799084022641182, -0.06846465915441513, 0.0023571476340293884, 0.02642918936908245, -0.05057685822248459, 0.029800178483128548, -0.06036723777651787, -0.012272411957383156, -0.022802220657467842, -0.02426644042134285, 0.05623406544327736, -0.07506053894758224, -0.02078152634203434, 0.02549685165286064, -0.030025657266378403, -0.0627482682466507, 0.062375299632549286, 0.03684084117412567, 0.06365678459405899, 0.0004415051080286503, -0.002180535811930895, 0.05225013941526413, -0.0693102702498436, -0.03649357333779335, 0.005159272346645594, -0.03298519179224968, 0.041419681161642075, -0.05325934663414955, -0.017585784196853638, -0.03843431547284126, -0.002649943344295025, 0.033329058438539505, -0.04736043140292168, -0.043852102011442184, -0.06713785231113434, -0.03237355872988701, 0.012679073959589005, -0.01959240809082985, 0.07324203103780746, 0.07468831539154053, 0.03327644243836403, -0.01596391387283802, 0.12015434354543686, 0.051839299499988556, 0.00980563648045063, -0.08275608718395233, 0.04445798322558403, -0.03891860321164131, 0.10891054570674896, -0.008730625733733177, -0.051655255258083344, -0.05982912331819534, 0.04106972739100456, 0.06872759014368057, 0.013289053924381733, 0.03469584137201309, -0.06673429906368256, -0.0695682018995285, 0.047426726669073105, 0.02815094031393528, -0.05552271753549576, 0.0010567272547632456, -0.051840681582689285, -0.01704293303191662, -0.047185055911540985, 0.036965738981962204, 0.03452568128705025, -0.05430837720632553, 0.0383443646132946, 0.0003438846324570477, -0.030417989939451218, 0.02749026007950306, -0.0546082966029644, 0.03005768544971943, 0.0025131346192210913, 0.0013019279576838017, -0.054173994809389114, -0.008382225409150124, 0.02153395675122738, 0.011912085115909576, -0.10461334884166718],
+ "VIRT_REG_GR16":[0.09543223679065704, 0.03513967618346214, 0.08986528217792511, -0.012217407114803791, -0.02076001651585102, -0.04190119728446007, 0.01318269595503807, -0.010142332874238491, -0.011869532987475395, -0.040446147322654724, 0.06552371382713318, 0.04439055174589157, 0.08176156878471375, -0.06334159523248672, -0.033928077667951584, -0.00024628525716252625, 0.0244551170617342, -0.019419007003307343, -0.09592454880475998, 0.005961012560874224, 0.03278326243162155, -0.07028506696224213, -0.08484592288732529, -6.329250754788518e-05, 0.015018146485090256, -0.05068608745932579, 0.0732998326420784, 0.023434389382600784, 0.0002124009479302913, 0.060401707887649536, 0.013626078143715858, -0.010556582361459732, -0.005069760140031576, -0.004616749472916126, -0.034329116344451904, 0.060584329068660736, -0.05430089309811592, -0.029179023578763008, 0.042385730892419815, -0.0652197003364563, 0.09378205984830856, -0.05090794339776039, -0.008510591462254524, 0.0837036669254303, 0.009071480482816696, 0.04464874789118767, -0.012855015695095062, 0.06306030601263046, -0.08556588739156723, -0.05393703281879425, -0.06741822510957718, -0.03717748448252678, 0.017156923189759254, 0.07401604950428009, -0.06629005819559097, -0.04564857482910156, -0.055414989590644836, 0.039407771080732346, -0.04089723527431488, 0.06915309280157089, 0.030190052464604378, 0.027542876079678535, 0.03557966649532318, 0.05191207677125931, -0.03237364813685417, -0.02036256715655327, -0.071859210729599, -0.06704329699277878, 0.0336633175611496, 0.09511569887399673, 0.0048662531189620495, 0.05273270234465599, -0.056247059255838394, 0.06079721450805664, -0.04150049015879631, -0.08104457706212997, -0.10303051024675369, 0.04522428661584854, -0.04379847273230553, -0.019447194412350655, 0.0021319733932614326, -0.010465282015502453, 0.06857019662857056, -0.00443653529509902, -0.08039603382349014, -0.05012141168117523, 0.0875077098608017, -0.03053239732980728, -0.05321606993675232, 0.016501901671290398, -0.0563507042825222, -0.03187479078769684, -0.0015389680629596114, 0.022985411807894707, -0.05008963868021965, 0.028300117701292038, 0.02875804342329502, -0.024458128958940506, -0.022238614037632942, -0.049835607409477234],
+ "VIRT_REG_GR32":[-0.008479167707264423, -0.02941126376390457, 0.05343153327703476, 0.03769504278898239, -0.0006716987118124962, -0.0329299233853817, 0.03442851081490517, -0.06826753169298172, -0.09117511659860611, -0.018657755106687546, 0.029032904654741287, 0.02404048666357994, 0.010598761960864067, -0.0482308566570282, 0.06956348568201065, -0.027967501431703568, -0.07380961626768112, -0.021098148077726364, -0.0808446854352951, 0.0127912862226367, -0.01355082169175148, -0.040285225957632065, 0.035385165363550186, -0.001157263875938952, -0.026462145149707794, -0.08616211265325546, -0.044482193887233734, -0.010969695635139942, 0.04645564407110214, -0.018178211525082588, -0.038536932319402695, -0.027571648359298706, -0.007523007690906525, -0.02699458785355091, -0.039170436561107635, 0.12889482080936432, -0.04512789845466614, -0.03883056715130806, 0.051210880279541016, 0.03924906626343727, 0.036943964660167694, -0.016879307106137276, 0.011263007298111916, 0.053573690354824066, -0.018964825198054314, -0.041856080293655396, -0.036545924842357635, 0.07715532928705215, -0.041981130838394165, -0.04114629328250885, -0.04393022507429123, -0.030163627117872238, 0.0019487979589030147, 0.10988762229681015, 0.09039165079593658, -0.0035424421075731516, -0.06272851675748825, 0.007701062131673098, -0.01971622183918953, 0.06203003600239754, 0.048561323434114456, -0.04599940404295921, 0.00802221056073904, -0.002905400237068534, -0.1050020381808281, 0.003395768813788891, -0.07973644882440567, 0.008020970039069653, -0.08614815771579742, 0.0518532320857048, 0.021174483001232147, 0.03254232555627823, -0.01905026100575924, -0.0009989180834963918, -0.06409642845392227, -0.022425753995776176, -0.03563409671187401, 0.07717793434858322, -0.04553033784031868, -0.02112392708659172, -0.002374667674303055, 0.03828585892915726, -0.014221777208149433, -0.015974245965480804, -0.01805220916867256, 0.04202109947800636, -0.0841534212231636, 0.06608130037784576, -0.11586519330739975, 0.024179989472031593, 0.017091574147343636, 0.08567194640636444, -0.03692129999399185, 0.03266705200076103, -0.046154942363500595, 0.0040525165386497974, -0.03177625685930252, 0.039895471185445786, 0.042960215359926224, -0.05573953315615654],
+ "VIRT_REG_GR32_ABCD":[0.016604775562882423, -0.0028934956062585115, 0.041060179471969604, -0.025077441707253456, -0.018642406910657883, 0.023762650787830353, -0.028646549209952354, -0.02460283786058426, 0.005985732190310955, 0.01774146780371666, -0.004014404024928808, -0.05473850294947624, -0.0417158380150795, -0.06322457641363144, 0.060795728117227554, -0.036435071378946304, -0.04245952516794205, 0.08069344609975815, 0.035319335758686066, -0.012020719237625599, 0.045771341770887375, -0.10842540860176086, 0.046253710985183716, -0.004099135287106037, 0.030616935342550278, -0.08288344740867615, 0.08569363504648209, -0.014164377935230732, -0.004303323570638895, 0.09726760536432266, 0.06208871304988861, -0.04007713496685028, 0.005815347656607628, 0.02377200312912464, 0.07813961058855057, 0.03192306309938431, -0.006230524741113186, 0.10110925883054733, -0.023409254848957062, 0.030774405226111412, -0.011607645079493523, -0.03929119184613228, 0.004817614797502756, -0.013827506452798843, 0.07770339399576187, -0.07994075864553452, -0.03157062083482742, 0.06743781268596649, 0.014881699346005917, -0.030165214091539383, -0.07844353467226028, -0.04563238099217415, 0.09747181832790375, 0.057128582149744034, 0.04173563793301582, -0.0011194447288289666, -0.01902887038886547, -0.032171595841646194, 0.04824799671769142, 0.008433254435658455, 0.024706291034817696, 0.0746094286441803, 0.04515853151679039, -0.0018984260968863964, -0.10070884972810745, -0.01883143000304699, -0.07785795629024506, 0.10938235372304916, -0.08001448959112167, -0.07419873028993607, 0.010544849559664726, 0.025767439976334572, -0.1005895584821701, 0.05103800818324089, -0.03675306960940361, -0.020510872825980186, 0.022482097148895264, 0.06463642418384552, -0.03149804100394249, -0.021647030487656593, 0.04025804623961449, 0.003628256032243371, 0.03532547131180763, -0.08667688816785812, 0.018817460164427757, -0.01690257526934147, -0.10114696621894836, -0.022815177217125893, 0.024386661127209663, 0.10286301374435425, 0.030005114153027534, 0.0370776504278183, -0.008584428578615189, -0.077603779733181, -0.03588058054447174, 0.030617419630289078, -0.07383710891008377, 0.03215676173567772, 0.03288266062736511, -0.036702848970890045],
+ "VIRT_REG_GR32_NOREX":[0.019052108749747276, -0.006784944795072079, -0.05410394072532654, 0.001966317882761359, -0.06686867773532867, 0.013514372520148754, 0.030097918584942818, -0.03868359327316284, 0.004314934369176626, -0.06713679432868958, 0.02491898462176323, 0.027683967724442482, 0.035907283425331116, -0.023093875497579575, -0.0892200842499733, -0.1052003800868988, -0.03923499956727028, 0.08808581531047821, -0.10092058777809143, 0.03336786851286888, -0.08974049985408783, -0.015254802070558071, 0.039686985313892365, -0.010083628818392754, -0.03423550724983215, -0.08821681141853333, -0.05621311068534851, -0.020327769219875336, -0.016793876886367798, 0.08908043801784515, -0.04112761467695236, -0.050139520317316055, -0.01524045504629612, 0.05841142684221268, 0.08270087838172913, 0.0348736047744751, -0.016146546229720116, 0.05751227214932442, 0.05081859603524208, -0.07304663956165314, -0.047101784497499466, -0.02825125865638256, 0.0006340605323202908, 0.0008785317186266184, -0.044239338487386703, 0.007173972204327583, -0.029449066147208214, 0.07254412025213242, -0.026029080152511597, 0.025982191786170006, -0.09524690359830856, -0.052613094449043274, -0.1270490437746048, 0.05319184809923172, 0.1046818196773529, 0.0477570965886116, -0.06291303783655167, 0.04725426062941551, -0.05330964922904968, 0.04056742787361145, 0.01543382927775383, 0.03627128154039383, -0.048232536762952805, 0.014761016704142094, -0.007380587514489889, -0.008060632273554802, -0.021923277527093887, -0.022500980645418167, -0.08495079725980759, 0.045358967036008835, -0.04728720709681511, 0.03550735488533974, 0.03445536270737648, -0.01891610585153103, -0.09439470618963242, -0.044266197830438614, -0.07952893525362015, 0.05221104994416237, -0.03507477045059204, 0.04218391329050064, 0.040326621383428574, -0.0395088866353035, 0.02447870559990406, -0.04280063137412071, 0.06520935893058777, -0.003358252113685012, -0.057561881840229034, 0.01911463774740696, 0.05295571684837341, 0.030342884361743927, 0.03814920783042908, -0.03366788476705551, 0.03090745024383068, 0.09487249702215195, -0.002995486371219158, -0.012020634487271309, -0.029147809371352196, 0.09558248519897461, 0.02548893168568611, 0.0931544378399849],
+ "VIRT_REG_GR64":[0.02717440389096737, -0.026730243116617203, -0.023244258016347885, 0.04027782380580902, 0.006808254402130842, -0.027519788593053818, -0.01906559243798256, 0.027793627232313156, -0.00129543652292341, -0.03455121070146561, 0.021734628826379776, 0.035481199622154236, -0.07251942157745361, -0.025691546499729156, -0.03271827474236488, -0.13225725293159485, -0.0601421520113945, 0.09084498882293701, -0.10225717723369598, 0.004034099169075489, 0.023578351363539696, -0.041603971272706985, 0.04199974611401558, -0.014711204916238785, -0.04272732138633728, -0.12534455955028534, -0.023738788440823555, 0.005328727886080742, 0.038416482508182526, -0.026419155299663544, -0.041119154542684555, 0.00022502713545691222, -0.05204978585243225, -0.019709734246134758, -0.04102563485503197, 0.06480151414871216, 0.009224721230566502, 0.04627599939703941, 0.027821402996778488, -0.05595114827156067, 0.04526345059275627, 0.024196594953536987, 0.10446277260780334, 0.07561361789703369, -0.08028160035610199, -0.0314163975417614, 0.11944323033094406, 0.1025814488530159, -0.08457476645708084, 0.02227119728922844, -0.041679076850414276, -0.02260834351181984, 0.036674268543720245, 0.10488750785589218, 0.019218411296606064, -0.015966340899467468, -0.06852715462446213, 0.026523491367697716, -0.11090730130672455, -0.0021082640159875154, -0.048291631042957306, -0.032388005405664444, 0.015713853761553764, 0.03355225548148155, -0.06502845883369446, -0.010098783299326897, -0.09930021315813065, -0.017413528636097908, -0.055861033499240875, 0.0801810696721077, -0.03900628536939621, -0.03278445452451706, -0.0337282195687294, -0.11434067040681839, -0.04371264949440956, -0.01736009307205677, -0.05100121721625328, 0.07490750402212143, -0.014680330641567707, -0.02126181870698929, 0.018013890832662582, 0.0018135658465325832, 0.029781077057123184, -0.012477489188313484, -0.021443217992782593, 0.047576501965522766, -0.05993758141994476, -0.06040889024734497, 0.016642581671476364, 0.011624492704868317, -0.042229063808918, -0.007573941722512245, -0.04010608047246933, -0.006444427650421858, -0.014495199546217918, -0.04122597724199295, -0.08505907654762268, -0.004049300216138363, 0.06545045226812363, -0.04762336611747742],
+ "VIRT_REG_GR64_ABCD":[0.04577033221721649, -0.07758746296167374, 0.00799313560128212, -0.11011485010385513, -0.010862522758543491, 0.012709266506135464, 0.05257265642285347, -0.07354705780744553, 0.04262387007474899, 0.07554348558187485, -0.06358839571475983, 0.006669520866125822, 0.049098193645477295, 0.11183933168649673, -0.028112098574638367, 0.021986473351716995, -0.02839403599500656, -0.06199958547949791, 0.08614487200975418, -0.041216861456632614, 0.041238460689783096, 0.005937385838478804, 0.00200703926384449, -0.05337367579340935, 0.037919919937849045, -0.07485998421907425, -0.09153831005096436, -0.0554175041615963, -0.10251995176076889, -0.01289951242506504, -0.030631467700004578, 0.04197017475962639, -0.03578301519155502, 0.010593005456030369, -0.05836241692304611, 0.06809061765670776, 0.10871735960245132, -0.09833388775587082, -0.009873395785689354, -0.056898634880781174, 0.05946199968457222, 0.015534073114395142, 0.01677171140909195, -0.020233800634741783, -0.006396631710231304, -0.049332089722156525, 0.012649210169911385, 0.03756912052631378, 0.0033660116605460644, -0.09084216505289078, -0.07142844051122665, -0.0030346515122801065, 0.0019640070386230946, 0.038837920874357224, 0.011760945431888103, 0.04995080456137657, -0.06997165083885193, -0.035297296941280365, 0.01996617764234543, 0.01954355463385582, -0.0934600979089737, 0.030165065079927444, -0.007337240036576986, -0.05346155911684036, 0.0732186883687973, -0.04716489836573601, -0.06555212289094925, -0.018465254455804825, 0.051119767129421234, -0.03106619231402874, 0.0748852789402008, -0.02095886692404747, 0.006320921704173088, 0.03146332502365112, -0.08238139003515244, -0.03618254140019417, -0.014570276252925396, 0.062481846660375595, -0.0394093319773674, -0.05171547457575798, -0.044726233929395676, -0.01228095218539238, 0.09699232876300812, 0.07471026480197906, 0.03112417459487915, 0.022543631494045258, -0.08634103089570999, 0.059702761471271515, -0.013801504857838154, 0.004984616301953793, 0.045798566192388535, -0.03205988556146622, -0.06150995194911957, -0.02244667150080204, 0.03318532556295395, 0.03462471440434456, 0.03236381709575653, 0.0884014293551445, -0.01604369841516018, -0.05234146490693092],
+ "VIRT_REG_GR64_NOREX":[-0.03959479182958603, -0.06190898269414902, -0.02920372597873211, -0.09973344951868057, -0.004333901684731245, -0.08522991091012955, 0.0459987074136734, -0.057674553245306015, 0.037046968936920166, -0.05669403821229935, -0.02221340872347355, -0.062426190823316574, 0.05804889276623726, -0.02635439857840538, -0.045627325773239136, 0.03632078319787979, 0.07128578424453735, 0.07544906437397003, -0.0537678524851799, -0.04624016210436821, 0.014316501095890999, 0.05580946058034897, 0.05251356214284897, -0.08244197070598602, -0.08901460468769073, -0.07641059905290604, -0.04924754425883293, 0.05417120084166527, -0.0060508353635668755, -0.00814742036163807, -0.06154030188918114, 0.05966867506504059, -0.03231468051671982, 0.021429890766739845, 0.031103987246751785, 0.04343251883983612, -0.08997714519500732, 0.039365898817777634, 0.052908625453710556, -0.02683917060494423, -0.05547752603888512, -0.014131218194961548, 0.0016863569617271423, -0.041112788021564484, -0.010230163112282753, -0.06687774509191513, -0.006144971586763859, -0.08074352145195007, 0.04034091532230377, -0.08176303654909134, -0.004055786412209272, -0.0024839320685714483, -0.007289807312190533, 0.06915127485990524, 0.023709064349532127, 0.04671626538038254, 0.06229325756430626, 0.04707597941160202, 0.06800796836614609, -0.02885584905743599, 0.030613983049988747, -0.019083039835095406, 0.045457858592271805, 0.040770504623651505, -0.05441175401210785, -0.05712401866912842, 0.07744520157575607, -0.0756613239645958, -0.06890957802534103, -0.07997069507837296, 0.09348486363887787, -0.04511028528213501, 0.036194607615470886, 0.040017660707235336, 0.016245214268565178, 0.023104460909962654, 0.058383163064718246, 0.0679842159152031, -0.00921112485229969, -0.10036550462245941, 0.09075804799795151, -0.059704095125198364, -0.013338442891836166, -0.005139742512255907, 0.07807526737451553, 0.06255412846803665, -0.008151572197675705, -0.0624256506562233, 0.012590888887643814, 0.03665084019303322, -0.028498578816652298, -0.01614067517220974, 0.007552243769168854, -0.007216903381049633, 0.0760180801153183, -0.04200543463230133, 0.06412865966558456, -0.05136435106396675, -0.0024792966432869434, 0.06856651604175568],
+ "VIRT_REG_GR64_NOREX_NOSP":[-0.0656895712018013, 0.058077458292245865, -0.006653467658907175, 0.037784356623888016, 0.07274001836776733, 0.07232078164815903, 0.07074914127588272, 0.05637859180569649, 0.04296007752418518, 0.05499762296676636, -0.01783664897084236, -0.08387365937232971, -0.01376343984156847, -0.07938199490308762, -0.027822256088256836, -0.0663403570652008, 0.036170270293951035, -0.07460261881351471, 0.08652043342590332, 0.02483147382736206, -0.07939319312572479, 0.033202506601810455, 0.0903514102101326, -0.10181311517953873, 0.060751549899578094, 0.07619930803775787, 0.05017509311437607, -0.0470910519361496, 0.07713821530342102, -0.0426195003092289, -0.04506472498178482, 0.003363420255482197, -0.0017315347213298082, 0.06264199316501617, 0.005245774984359741, -0.027923958376049995, 0.09868567436933517, 0.06738796830177307, -0.10339145362377167, 0.0020383980590850115, 0.087734155356884, 0.011040030047297478, -0.05993311479687691, -0.05790332704782486, 0.01574312523007393, 0.009771298617124557, 0.022676382213830948, -0.009197148494422436, 0.03372732177376747, 0.08404259383678436, -0.015135225839912891, -0.04693703353404999, 0.09917140752077103, 0.007134507410228252, 0.020209072157740593, -0.00027669535484164953, -0.0351635180413723, 0.03751315921545029, -0.019665181636810303, 0.028500953689217567, 0.034186746925115585, -0.005931361112743616, 0.05645192414522171, -0.02027188241481781, -0.022675039246678352, -0.08812297880649567, -0.014896178618073463, -0.048788342624902725, 0.008708382956683636, 0.019917558878660202, -0.002275944221764803, 0.03409638628363609, 0.033304013311862946, 0.057676300406455994, 0.039842985570430756, -0.025169866159558296, 0.016520975157618523, -0.030201178044080734, -0.021718870848417282, -0.07023277878761292, -0.007528252899646759, 0.009067370556294918, -0.0460657961666584, 0.07117785513401031, -0.03609836474061012, -0.011893372051417828, -0.006047600414603949, 0.0179970171302557, 0.024480223655700684, -0.03918423503637314, 0.004897980485111475, 0.05040167644619942, 0.010113563388586044, -0.1074901670217514, -0.06277655810117722, -0.02934161201119423, -0.06922926008701324, -0.05638887360692024, 0.05314395949244499, 0.04588884115219116],
+ "VIRT_REG_GR64_NOSP":[0.0015277941711246967, -0.03938478231430054, -0.030811766162514687, 0.027071669697761536, 0.02127140760421753, 0.0015787228476256132, -0.07842491567134857, 0.004658385645598173, -0.05909501388669014, -0.03576778993010521, -0.07251477241516113, 0.12117832154035568, 0.04499363154172897, -0.009405314922332764, -0.01015283353626728, -0.002841090550646186, 0.0689091831445694, 0.10697457194328308, -0.09274765104055405, -0.027955353260040283, -0.0379958301782608, -0.044126156717538834, 0.04907212778925896, -0.038063473999500275, -0.003686746582388878, -0.08313410729169846, -0.045181579887866974, -0.011702840216457844, -0.006579228211194277, 0.046807315200567245, -0.045654296875, -0.03466613590717316, -0.08313826471567154, -0.06678880006074905, -0.027727074921131134, 0.036734677851200104, -0.040936414152383804, 0.05170389637351036, 0.038199927657842636, 0.02960256300866604, 0.0355701707303524, -0.02052776888012886, 0.06218089163303375, 0.10570456087589264, -0.036479029804468155, -0.008999336510896683, -0.031860992312431335, 0.07250168174505234, -0.061084795743227005, -0.057996805757284164, -0.010533110238611698, -0.018169214949011803, 0.017261315137147903, 0.10023517906665802, -0.044131457805633545, -0.07618662714958191, -0.09124933928251266, 0.01819406822323799, -0.05906827375292778, 0.04295642301440239, -0.03197735920548439, 0.03641442954540253, 0.005168464966118336, -0.00010972691961796954, -0.0829579159617424, -0.014677388593554497, -0.08750011026859283, -0.04695136100053787, -0.07696729153394699, -0.00718996487557888, 0.018294518813490868, -0.014321570284664631, -0.04416860267519951, -0.0890057235956192, -0.014466283842921257, 0.02831638976931572, -0.04845190420746803, 0.08228176832199097, 0.03420877829194069, 0.056510377675294876, 0.037403274327516556, 0.04364967346191406, 0.08903267979621887, -0.016827082261443138, -0.0682789757847786, 0.06286796927452087, -0.0958203598856926, 0.018489282578229904, 0.02886355295777321, 0.028006011620163918, 0.039986785501241684, -0.04771937429904938, -0.004648604430258274, 0.033939141780138016, -0.027820419520139694, -0.026187442243099213, -0.07972361892461777, 0.006323353853076696, 0.016448041424155235, -0.01961681991815567],
+ "VIRT_REG_GR64_NOSP_and_GR64_TC":[0.08079065382480621, -0.05147358775138855, -0.08338657021522522, 0.06757336109876633, -0.015237463638186455, 0.026806311681866646, 0.07564966380596161, -0.037159934639930725, -0.02222878858447075, -0.04553138092160225, -0.006632891017943621, 0.001604291144758463, 0.043711669743061066, 0.0710049569606781, -0.08854726701974869, -0.03142566233873367, -0.0865127220749855, 0.08521236479282379, 0.039203498512506485, 0.04737624153494835, 0.02893459051847458, 0.004120660945773125, 0.03552098199725151, -0.0010448878165334463, 0.04423774778842926, 0.03258584439754486, 0.03433830663561821, -0.019990455359220505, -0.03263172507286072, 0.09782663732767105, -0.00702365068718791, -0.06544602662324905, 0.013447105884552002, 0.04603038728237152, 0.029931804165244102, 0.0988783910870552, -0.062023941427469254, -0.0070026409812271595, 0.032557111233472824, -0.08212000876665115, 0.03199682757258415, 0.020828546956181526, 0.07071725279092789, -0.018812179565429688, -0.0184739138931036, -0.06008931249380112, 0.01504000648856163, -0.019235603511333466, 0.014653048478066921, -0.009083813987672329, 0.03171474114060402, 0.019499456509947777, 0.05263463407754898, 0.10554639250040054, -0.02759619802236557, -0.00156494346447289, -0.03898271545767784, 0.06027846410870552, -0.061001915484666824, 0.039365388453006744, -0.06546281278133392, 0.0006352368509396911, 0.0500405877828598, -0.03232716768980026, -0.010176514275372028, 0.002549059921875596, 0.0666508674621582, -0.037290267646312714, -0.028836704790592194, 0.06271649152040482, -0.016647985205054283, 0.013602355495095253, 0.020110899582505226, 0.011730309575796127, -0.10071564465761185, -0.06239647418260574, -0.09507977962493896, -0.09190725535154343, -0.08861985802650452, -0.0006123466300778091, 0.0951915979385376, -0.035364676266908646, -0.04007220268249512, 0.08415472507476807, 0.0006664254469797015, 0.05864431709051132, 0.01460045762360096, -0.09507087618112564, 0.024228032678365707, 0.04208158329129219, 0.006106846500188112, 0.09294755011796951, 0.06157369166612625, 0.0826527327299118, -0.058974966406822205, -0.09958664327859879, 0.06913749873638153, -0.08108915388584137, 0.07425157725811005, 0.04784728214144707],
+ "VIRT_REG_GR64_TC":[-0.0944172665476799, 0.040403831750154495, -0.017597073689103127, 0.04766053333878517, -0.03104357235133648, 0.025751160457730293, 0.036779265850782394, -0.0235747080296278, 0.032111138105392456, 0.009872193448245525, -0.01596468687057495, 0.05234881862998009, -0.047335200011730194, 0.005157034378498793, -0.02132921665906906, -0.0544377863407135, 0.057515472173690796, -0.006743279751390219, -0.01474941335618496, -0.0990658849477768, 0.022418741136789322, -0.007098495960235596, 0.046933863312006, 0.1002131924033165, 0.01583809033036232, 0.03995800018310547, -0.017743254080414772, -0.01684877835214138, 0.06543229520320892, 0.04597911611199379, 0.05365373566746712, -0.008774830959737301, -0.01341968309134245, -0.004754040390253067, 0.04739849269390106, 0.032378777861595154, -0.0020728895906358957, 0.03502136841416359, 0.05946416035294533, -0.06190952658653259, 0.01910495012998581, -0.023678753525018692, 0.012653682380914688, -0.06766874343156815, -0.0729866623878479, 0.0757005363702774, -0.027033904567360878, -0.06776778399944305, -0.010131776332855225, -0.06334701925516129, -0.04702980816364288, 0.06837917864322662, 0.002726735547184944, 0.04345812648534775, 0.04288078844547272, -0.06921732425689697, -0.07625382393598557, 0.037991974502801895, -0.04257906600832939, 0.06338586658239365, 0.05315309390425682, -0.02785014547407627, 0.04054750129580498, 0.06967299431562424, -0.07271680235862732, 0.0032969408202916384, -0.08254148811101913, 0.07269596308469772, -0.01827111467719078, 0.034775473177433014, 0.010106234811246395, 0.0389409065246582, 0.042805008590221405, -0.03822058066725731, 0.0668339803814888, -0.005216705612838268, -0.00022202919353730977, -0.0221820380538702, -0.027401722967624664, -0.045061662793159485, -0.05296671763062477, -0.0190189890563488, -0.002744461875408888, -0.04073096439242363, -0.06974441558122635, 0.05868958309292793, -0.06907399743795395, -0.026619713753461838, 0.015318086370825768, 0.035948701202869415, -0.08301021158695221, 0.03955607861280441, 0.028369972482323647, 0.0202812347561121, -0.12075140327215195, -0.039504438638687134, -0.03826067969202995, 0.01607581228017807, 0.02135113812983036, -0.08897850662469864],
+ "VIRT_REG_GR64_TC_with_sub_8bit":[0.00805664248764515, 0.06228634715080261, -0.005148644559085369, -0.025605352595448494, -0.04853198677301407, -0.018169978633522987, 0.008530518971383572, -0.1050964742898941, -0.08428415656089783, -0.014802628196775913, 0.05918573588132858, 0.07529161125421524, 0.09815273433923721, -0.014188972301781178, 0.06676790118217468, 0.09496084600687027, -0.03843621164560318, -0.00740150036290288, -0.11988909542560577, -0.01781499572098255, -0.03719411790370941, -0.07447166740894318, 0.005513608455657959, -0.014381160028278828, 0.036786310374736786, -0.04839075356721878, -0.009440913796424866, 0.03984222561120987, -0.08096668124198914, 0.026751000434160233, 0.06400448083877563, 0.07998895645141602, 2.295125523232855e-05, 0.0266779325902462, -0.0030931613873690367, 0.05236855521798134, -0.010479471646249294, -0.011119752191007137, -0.06124376133084297, -0.019449712708592415, 0.03448517248034477, -0.04095051810145378, 0.01377212442457676, 0.09643338620662689, 0.021325431764125824, 0.06029453128576279, 0.048866767436265945, -0.03436344116926193, -0.043422505259513855, 0.03822150453925133, 0.004718889016658068, -0.04090931639075279, -0.04219569265842438, 0.019032739102840424, 0.06111171841621399, 0.04305591061711311, -0.0379939004778862, -0.03224434703588486, -0.06517905741930008, 0.002272483194246888, 0.09273418039083481, -0.028145847842097282, 0.01824336126446724, 0.00936606340110302, -0.07281909137964249, -0.028650810942053795, -0.060721538960933685, -0.09477518498897552, -0.0014060320099815726, 0.06919887661933899, -0.03463669493794441, 0.0026504716370254755, -0.0653621107339859, -0.02800566703081131, -0.02503957599401474, -0.060285311192274094, 0.014794053509831429, -0.08424058556556702, 0.0482206828892231, -0.07467620074748993, -0.09909844398498535, -0.06888734549283981, -0.0014173799427226186, -0.09022543579339981, 0.06461413204669952, 0.024526789784431458, -0.07400602847337723, -0.008816084824502468, 0.025513656437397003, 0.047476526349782944, -0.05981749668717384, 0.08338218182325363, 0.02657591737806797, 0.03547860309481621, -0.043622229248285294, 0.10129662603139877, 0.08802521973848343, -0.09759330749511719, 0.025680232793092728, 0.05964493378996849],
+ "VIRT_REG_GR64_with_sub_16bit_in_GR16_NOREX":[-0.03117012232542038, -0.02872271090745926, -0.039712607860565186, 0.03738812729716301, 0.030099159106612206, 0.00013636364019475877, -0.019107641652226448, -0.04186702147126198, -0.053099144250154495, -0.020432034507393837, -0.0004185919533483684, 0.010934959165751934, 0.036054231226444244, 0.03788067027926445, 0.05227302014827728, -0.034505825489759445, -0.08298061788082123, 0.0399160161614418, 0.03668724000453949, 0.014606554992496967, -0.0071771652437746525, 0.059049926698207855, -0.06330917030572891, 0.007379058748483658, -0.0750177726149559, -0.0423760749399662, -0.019386067986488342, -0.018436923623085022, -0.015116279944777489, 0.023602722212672234, 0.0533282607793808, -0.026401247829198837, 0.023750485852360725, -0.027648568153381348, -0.016443056985735893, 0.04291580244898796, -0.04391908273100853, 0.05113501846790314, -0.03743087872862816, 0.056367188692092896, 0.048130668699741364, -0.0230261143296957, 0.03358393907546997, -0.030188169330358505, 0.08421863615512848, 0.0033821314573287964, 0.03151029348373413, -0.042818162590265274, 0.04007953777909279, -0.0050337472930550575, 0.03335743024945259, -0.026563530787825584, 0.016440672799944878, -0.04272226244211197, -0.07304228097200394, 0.024836458265781403, -0.016342775896191597, -0.055494848638772964, -0.05826134234666824, 0.027478834614157677, 0.025981346145272255, -0.04745938256382942, 0.013695796020328999, -0.027888784185051918, 0.03769542649388313, -0.024486247450113297, 0.04720773920416832, -0.012697651982307434, -0.03559652715921402, 0.012948199175298214, -0.025600459426641464, 0.014954420737922192, -0.06651762872934341, 0.04277091473340988, -0.08291683346033096, 0.016881149262189865, 0.04145864024758339, -0.04162050038576126, -0.03363965451717377, -0.05018439516425133, 0.06321889907121658, -0.00871780700981617, 0.06867428869009018, 0.057975344359874725, 0.009704249911010265, 0.049075234681367874, -0.06111253425478935, 0.027943406254053116, 0.03725599870085716, 0.032480716705322266, -0.01960119605064392, -0.0295172780752182, 0.014026675373315811, 0.056797921657562256, -0.031707022339105606, 0.0010152219329029322, -0.023705823346972466, -0.07695567607879639, 0.017504720017313957, -0.0020094760693609715],
+ "VIRT_REG_GR64_with_sub_8bit":[-0.011493992060422897, -0.027181852608919144, 0.022013556212186813, 0.05687474459409714, -0.03289574757218361, -0.04803529754281044, -0.04204253479838371, 0.044671084731817245, -0.0849028080701828, -0.09561576694250107, 0.03596775606274605, 0.027156801894307137, 0.05034027621150017, -0.006308000069111586, 0.012393618933856487, -0.048590339720249176, -0.049129705876111984, 0.059305012226104736, -0.10330235958099365, 0.00738809397444129, 0.03855152800679207, -0.03220852091908455, 0.05221837759017944, -0.01274650078266859, 0.024303985759615898, -0.05925533175468445, -0.015623844228684902, -0.025864524766802788, 0.009918035939335823, 0.004779431037604809, -0.02866589091718197, 0.006512579973787069, -0.037251196801662445, 0.005028596147894859, -0.011677909642457962, 0.051886074244976044, -0.03552602231502533, 0.011968757025897503, 0.00829426757991314, -0.06981230527162552, -0.029781555756926537, -0.012621275149285793, 0.08595969527959824, 0.08630531281232834, 0.10018875449895859, -0.054863955825567245, -0.044519901275634766, 0.0893385037779808, 0.04004377871751785, 0.003711731405928731, -0.021447300910949707, -0.08500636368989944, 0.0037281641270965338, 0.14561010897159576, 0.03993009030818939, 0.07621612399816513, 0.020513180643320084, 0.004926605150103569, -0.035578932613134384, 0.06101486086845398, -0.08422145247459412, -0.03511432558298111, 0.01537742093205452, -0.010146304965019226, -0.05133780837059021, -0.010472903028130531, -0.09726933389902115, -0.010570867918431759, -0.09348491579294205, 0.002129049738869071, -0.01265127956867218, 0.03504374623298645, -0.008679943159222603, -0.002507386729121208, -0.06586045026779175, -0.04775359109044075, -0.042809367179870605, 0.08359787613153458, -0.0230431966483593, -0.015440763905644417, 0.0195400882512331, -0.0186530202627182, -0.03176320344209671, -0.019522372633218765, -0.02984560839831829, 0.024256182834506035, -0.07656785100698471, 0.03944750130176544, 0.016559945419430733, 0.007124909665435553, 0.08061631768941879, 0.08561833202838898, -0.018525447696447372, -0.0019649232272058725, -0.018469924107193947, -0.012311050668358803, -0.08448101580142975, 0.060216110199689865, 0.06368701905012131, -0.07110093533992767],
+ "VIRT_REG_GR8":[0.02255251444876194, 0.012649326585233212, 0.05363747105002403, -0.006129346787929535, 0.027027001604437828, 0.03703385218977928, -0.045294541865587234, -0.02489621751010418, 0.026587747037410736, -0.06228360906243324, 0.01547946222126484, 0.03494448587298393, 0.08276952058076859, -0.03470698744058609, 0.0036826131399720907, 0.04216131567955017, -0.04518325626850128, 0.09584730118513107, -0.09126991778612137, -0.11293632537126541, 0.0141398124396801, -0.05086163431406021, 0.0421922467648983, -0.0001364851341350004, 0.05821910500526428, -0.04154132679104805, 0.036521218717098236, -0.016718950122594833, 0.0773339569568634, 0.05134757608175278, -0.03728386387228966, -0.014684299007058144, 0.016949277371168137, 0.025767508894205093, -0.01573120802640915, 0.0343811996281147, 0.008209497667849064, 0.0011038129450753331, -0.06688684970140457, -0.08167136460542679, 0.03875276446342468, 0.08301592618227005, 0.023012684658169746, 0.07135005295276642, 0.008461466059088707, 0.004998552612960339, 0.02622731775045395, -0.09479465335607529, 0.014987453818321228, -0.008574756793677807, -0.008050303906202316, -0.005560623947530985, 0.04616820812225342, 0.11537269502878189, 0.032199542969465256, 0.05507092550396919, -0.053164780139923096, 0.012255114503204823, -0.01981479674577713, 0.06012535095214844, 0.043957680463790894, 0.02384384348988533, 0.04837791621685028, 0.04945961385965347, -0.1063770279288292, -0.07354240119457245, -0.08922741562128067, -0.026019031181931496, -0.08768662065267563, 0.09241457283496857, 0.03253300115466118, -0.018267929553985596, -0.04406850412487984, -0.05577726289629936, -0.05304105579853058, 0.016035545617341995, 0.05610279366374016, 0.06247573718428612, -0.019430609419941902, -0.017088554799556732, -0.022114543244242668, 0.07442588359117508, -0.017668865621089935, -0.02403153106570244, 0.006919574458152056, 0.05879344418644905, -0.0885634645819664, -0.016336753964424133, -0.024662213400006294, 0.029266972094774246, -0.04889025166630745, 0.042460259050130844, -0.013102580793201923, 0.023992935195565224, 0.024768078699707985, 0.047551900148391724, -0.02243787795305252, 0.05929713696241379, 0.03110451251268387, -0.00550821190699935],
+ "VIRT_REG_RFP80":[-0.04414765536785126, 0.05147779360413551, -0.035608600825071335, -0.03939598798751831, 0.0430026613175869, -0.03331028297543526, 0.015591064468026161, 0.01892651617527008, -0.011428372003138065, -0.06980786472558975, 0.06445881724357605, 0.1036338210105896, 0.01164929661899805, -0.07599718868732452, 0.022036561742424965, 0.10396245121955872, -0.041171155869960785, -0.07264886051416397, 0.00032837275648489594, 0.02848120965063572, -0.031889040023088455, 0.023848745971918106, -0.02298046089708805, -0.05559201166033745, 0.026687605306506157, 0.0565699003636837, -0.0134252505376935, 0.05494402348995209, -0.0584089457988739, 0.05422470346093178, -0.024360226467251778, 0.03570455685257912, 0.013681530021131039, -0.006910417694598436, 0.011886067688465118, 0.07619262486696243, 0.08147607743740082, 0.05824091285467148, 0.001224246108904481, -0.030463339760899544, -0.023527851328253746, 0.03078501485288143, -0.02225799672305584, -0.058049511164426804, 0.015403151512145996, 0.07900431007146835, 0.025944147258996964, 0.021455328911542892, 0.023985104635357857, -0.0327906534075737, 0.04195002466440201, -0.10313323140144348, -0.023333510383963585, -0.010316243395209312, -0.02042137086391449, 0.07474000751972198, 0.02313513681292534, -0.0030733307357877493, 0.06138097122311592, 0.005197131074965, -0.03222955763339996, 0.005364845506846905, -0.05313501134514809, 0.0013082564109936357, 0.025044983252882957, 0.0349799208343029, 0.09704083949327469, -0.017403649166226387, -0.03375721350312233, 0.05970870703458786, -0.021679691970348358, -0.04719642922282219, 0.024217652156949043, -0.06130526587367058, 0.004813425708562136, 0.07473690062761307, -0.039600174874067307, -0.009295261465013027, 0.05440402403473854, 0.04785943776369095, -0.04006686061620712, -0.020133933052420616, 0.00989031046628952, -0.054447200149297714, 0.06291327625513077, -0.01196430902928114, 0.0841275230050087, -0.05557875707745552, -0.0813804343342781, -0.0746457576751709, -0.024255990982055664, -0.048101916909217834, -0.014132879674434662, -0.013147399760782719, -0.009715595282614231, 0.08717820793390274, -0.04318689927458763, -0.0311901792883873, -0.017253845930099487, 0.005144816357642412],
+ "VIRT_REG_VR128":[0.08292517066001892, 0.053138989955186844, 0.0019234063802286983, -0.030035940930247307, 0.0821828693151474, -0.0540342852473259, 0.06449387222528458, -0.03985493257641792, 0.026820721104741096, 0.0352952741086483, -0.1056072935461998, 0.054804764688014984, 0.01685425080358982, 0.05867069214582443, 0.11665259301662445, -0.07655566930770874, 0.021201618015766144, 0.00927705504000187, -0.04723019897937775, 0.016582123935222626, -0.01160470675677061, -0.013075411319732666, 0.01054342370480299, -0.05403316020965576, 0.033609066158533096, -0.07971179485321045, 0.1005927175283432, -0.020655132830142975, -0.0036442605778574944, 0.018269486725330353, 0.036334097385406494, -0.06517180055379868, -0.028530113399028778, -0.03768114373087883, 0.10582506656646729, 0.011199450120329857, -0.06707775592803955, 0.02332702837884426, -0.014528930187225342, -0.09369251132011414, 0.069722481071949, 0.031001657247543335, 0.08032777905464172, -0.060744334012269974, 0.015131807886064053, 0.01935953088104725, -0.087028868496418, 0.041773099452257156, 0.0381581112742424, -0.07518653571605682, 0.021307995542883873, -0.07350508868694305, -0.04699733853340149, -0.007377162110060453, 0.07836157828569412, 0.016066696494817734, -0.02160775288939476, -0.030519334599375725, -0.09255059063434601, 0.03597188740968704, -0.11260625720024109, -0.08602424710988998, 0.058293748646974564, -0.034749604761600494, 0.005541469436138868, -0.07924741506576538, -0.024103455245494843, 0.06047135218977928, 0.026729481294751167, 0.03493977710604668, -0.07453227788209915, -0.01716521382331848, 0.008985077030956745, -0.08075122535228729, 0.03353623300790787, -0.08125714957714081, 0.04245763644576073, 0.06520543247461319, 0.020550349727272987, -0.003161275526508689, -0.03491697832942009, -0.005496494937688112, 0.09021904319524765, -0.057418785989284515, 0.03494826331734657, -0.052578359842300415, -0.044952504336833954, 0.11770184338092804, -0.048565153032541275, -0.03815764561295509, 0.06020108237862587, -0.09397949278354645, 0.03820547088980675, 0.08039405196905136, 0.014751153998076916, 0.006572262849658728, 0.05658692866563797, 0.05043925344944, -0.0060436660423874855, -0.12018798291683197],
+ "VIRT_REG_VR256":[0.032775089144706726, 0.029240285977721214, 0.01821955479681492, 0.023595772683620453, -0.02587016113102436, -0.12190376222133636, 0.09720813482999802, 0.005780891049653292, -0.0581410676240921, 0.04817686229944229, -0.04627984017133713, 0.03618951886892319, -0.10393846780061722, 0.04380590096116066, 0.030101926997303963, -0.021811308339238167, 0.0012455569813027978, 0.06209835410118103, -0.08859474956989288, 0.0671553835272789, -0.006448917090892792, 0.0169842429459095, 0.031113164499402046, -0.07417412847280502, 0.05549546331167221, -0.013042094185948372, 0.0948401540517807, -0.07335975021123886, -0.03987044095993042, -0.005343804135918617, -0.08741248399019241, -0.08009110391139984, 0.005667346995323896, 0.03745159134268761, 0.019986214116215706, -0.03723142296075821, -0.0037649653386324644, 0.005682446528226137, 0.0659727230668068, -0.002658356446772814, 0.07049102336168289, -0.01944110542535782, -0.014278342947363853, 0.04189611226320267, 0.0312303826212883, -0.046760618686676025, 0.040438465774059296, 0.054074693471193314, 0.07479880005121231, -0.016405146569013596, 0.027125591412186623, -0.04216836765408516, 0.0011189498472958803, -0.01471384521573782, -0.010250975377857685, -0.006412460468709469, -0.12170380353927612, 0.015495882369577885, -0.054699406027793884, 0.05955614894628525, 0.06753991544246674, -0.03688138723373413, 0.049010518938302994, -0.07614680379629135, 0.06504888087511063, -0.014145595952868462, 0.02210555598139763, 0.023598313331604004, 0.00511248828843236, 0.013318972662091255, -0.11605404317378998, -0.032067783176898956, -0.05010659247636795, -0.023693162947893143, 0.06650379300117493, -0.026386691257357597, 0.06052805855870247, 0.0515507273375988, 0.033960308879613876, -0.06421340256929398, -0.09355985373258591, -0.0658700093626976, 0.10278744995594025, -0.10271084308624268, -0.012089421041309834, -0.04169749841094017, -0.07112454622983932, -0.032573599368333817, -0.0003141233173664659, 0.017007946968078613, 0.03622191399335861, 0.05829676240682602, 0.06261610984802246, 0.005667738616466522, 0.009631159715354443, 0.022852277383208275, 0.057013869285583496, -0.05015721917152405, 0.027599012479186058, -0.08637165278196335]
+ },
+ "PhysicalRegisters" : {
+ "PHY_REG":[-0.008169060572981834, -0.017023155465722084, -0.04927198588848114, 0.0014261528849601746, 0.012259463779628277, -0.02794509381055832, -0.024857040494680405, 0.029203711077570915, 0.0433109886944294, 0.009679347276687622, -0.05811547115445137, -0.09075025469064713, -0.08525611460208893, -0.10545054078102112, 0.06474080681800842, 0.056396666914224625, 0.06781823933124542, 0.09059076011180878, -0.10420752316713333, -0.08284831047058105, 0.02349182404577732, -0.0354253351688385, -0.004627702757716179, 0.0068538435734808445, -0.053724177181720734, -0.02113335393369198, 0.05254676192998886, -0.050769440829753876, 0.061386119574308395, -0.07541731745004654, -0.024204161018133163, -0.0009893826209008694, -0.007493770215660334, -0.017051052302122116, 0.015025814063847065, -0.020427946001291275, -0.0844966471195221, 0.04589429497718811, 0.025571472942829132, -0.05280151963233948, 0.06895384937524796, 0.03960262984037399, 0.0068003153428435326, 0.09397424012422562, -0.0523529127240181, 0.03780638054013252, -0.015423302538692951, 0.029167350381612778, 0.01019437238574028, 0.023989612236618996, -0.03344425559043884, -0.07926471531391144, -0.09238854795694351, 0.04794330149888992, 0.01872367039322853, -0.029179377481341362, -0.05339968949556351, -0.04575541242957115, -0.004491546656936407, -0.009650425054132938, 0.026945313438773155, -0.02115861512720585, 0.06488905847072601, -0.06647083908319473, 0.008904196321964264, 0.010536684654653072, -0.06012551859021187, -0.00022655133216176182, -0.10175421833992004, 0.062001921236515045, -0.054452817887067795, 0.01785552129149437, -0.06749527156352997, -0.04883178323507309, -0.023449009284377098, 0.040745027363300323, 0.002448269398882985, 0.07842953503131866, -0.019806355237960815, -0.08275315910577774, 0.01131721492856741, 0.0482926219701767, 0.01892486959695816, 0.005685009527951479, -0.0055344682186841965, -0.0034555341117084026, -0.07923021167516708, 0.06387833505868912, 0.05978211387991905, -0.001252106623724103, 0.07216084003448486, -0.01223798282444477, 0.09716741740703583, 0.009659498929977417, -0.09404221922159195, -0.10122949630022049, -0.003581057768315077, 0.07885389029979706, 0.05305042862892151, -0.04988719895482063]
}
} \ No newline at end of file
diff --git a/llvm/lib/AsmParser/AsmParserContext.cpp b/llvm/lib/AsmParser/AsmParserContext.cpp
new file mode 100644
index 0000000..59d3ffc
--- /dev/null
+++ b/llvm/lib/AsmParser/AsmParserContext.cpp
@@ -0,0 +1,89 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/AsmParser/AsmParserContext.h"
+
+namespace llvm {
+
+std::optional<FileLocRange>
+AsmParserContext::getFunctionLocation(const Function *F) const {
+ if (auto FIt = Functions.find(F); FIt != Functions.end())
+ return FIt->second;
+ return std::nullopt;
+}
+
+std::optional<FileLocRange>
+AsmParserContext::getBlockLocation(const BasicBlock *BB) const {
+ if (auto BBIt = Blocks.find(BB); BBIt != Blocks.end())
+ return BBIt->second;
+ return std::nullopt;
+}
+
+std::optional<FileLocRange>
+AsmParserContext::getInstructionLocation(const Instruction *I) const {
+ if (auto IIt = Instructions.find(I); IIt != Instructions.end())
+ return IIt->second;
+ return std::nullopt;
+}
+
+Function *
+AsmParserContext::getFunctionAtLocation(const FileLocRange &Query) const {
+ for (auto &[F, Loc] : Functions) {
+ if (Loc.contains(Query))
+ return F;
+ }
+ return nullptr;
+}
+
+Function *AsmParserContext::getFunctionAtLocation(const FileLoc &Query) const {
+ return getFunctionAtLocation(FileLocRange(Query, Query));
+}
+
+BasicBlock *
+AsmParserContext::getBlockAtLocation(const FileLocRange &Query) const {
+ for (auto &[BB, Loc] : Blocks) {
+ if (Loc.contains(Query))
+ return BB;
+ }
+ return nullptr;
+}
+
+BasicBlock *AsmParserContext::getBlockAtLocation(const FileLoc &Query) const {
+ return getBlockAtLocation(FileLocRange(Query, Query));
+}
+
+Instruction *
+AsmParserContext::getInstructionAtLocation(const FileLocRange &Query) const {
+ for (auto &[I, Loc] : Instructions) {
+ if (Loc.contains(Query))
+ return I;
+ }
+ return nullptr;
+}
+
+Instruction *
+AsmParserContext::getInstructionAtLocation(const FileLoc &Query) const {
+ return getInstructionAtLocation(FileLocRange(Query, Query));
+}
+
+bool AsmParserContext::addFunctionLocation(Function *F,
+ const FileLocRange &Loc) {
+ return Functions.insert({F, Loc}).second;
+}
+
+bool AsmParserContext::addBlockLocation(BasicBlock *BB,
+ const FileLocRange &Loc) {
+ return Blocks.insert({BB, Loc}).second;
+}
+
+bool AsmParserContext::addInstructionLocation(Instruction *I,
+ const FileLocRange &Loc) {
+ return Instructions.insert({I, Loc}).second;
+}
+
+} // namespace llvm
diff --git a/llvm/lib/AsmParser/CMakeLists.txt b/llvm/lib/AsmParser/CMakeLists.txt
index 20d0c50..dcfcc06 100644
--- a/llvm/lib/AsmParser/CMakeLists.txt
+++ b/llvm/lib/AsmParser/CMakeLists.txt
@@ -1,5 +1,6 @@
# AsmParser
add_llvm_component_library(LLVMAsmParser
+ AsmParserContext.cpp
LLLexer.cpp
LLParser.cpp
Parser.cpp
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index 50d1d47..7a6c19e 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -191,6 +191,8 @@ int LLLexer::getNextChar() {
}
lltok::Kind LLLexer::LexToken() {
+ // Set token end to next location, since the end is exclusive.
+ PrevTokEnd = CurPtr;
while (true) {
TokStart = CurPtr;
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index f71a534..5164cec 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -752,14 +752,21 @@ bool LLParser::parseDeclare() {
/// ::= 'define' FunctionHeader (!dbg !56)* '{' ...
bool LLParser::parseDefine() {
assert(Lex.getKind() == lltok::kw_define);
+ FileLoc FunctionStart(Lex.getTokLineColumnPos());
Lex.Lex();
Function *F;
unsigned FunctionNumber = -1;
SmallVector<unsigned> UnnamedArgNums;
- return parseFunctionHeader(F, true, FunctionNumber, UnnamedArgNums) ||
- parseOptionalFunctionMetadata(*F) ||
- parseFunctionBody(*F, FunctionNumber, UnnamedArgNums);
+ bool RetValue =
+ parseFunctionHeader(F, true, FunctionNumber, UnnamedArgNums) ||
+ parseOptionalFunctionMetadata(*F) ||
+ parseFunctionBody(*F, FunctionNumber, UnnamedArgNums);
+ if (ParserContext)
+ ParserContext->addFunctionLocation(
+ F, FileLocRange(FunctionStart, Lex.getPrevTokEndLineColumnPos()));
+
+ return RetValue;
}
/// parseGlobalType
@@ -7018,6 +7025,8 @@ bool LLParser::parseFunctionBody(Function &Fn, unsigned FunctionNumber,
/// parseBasicBlock
/// ::= (LabelStr|LabelID)? Instruction*
bool LLParser::parseBasicBlock(PerFunctionState &PFS) {
+ FileLoc BBStart(Lex.getTokLineColumnPos());
+
// If this basic block starts out with a name, remember it.
std::string Name;
int NameID = -1;
@@ -7059,6 +7068,7 @@ bool LLParser::parseBasicBlock(PerFunctionState &PFS) {
TrailingDbgRecord.emplace_back(DR, DeleteDbgRecord);
}
+ FileLoc InstStart(Lex.getTokLineColumnPos());
// This instruction may have three possibilities for a name: a) none
// specified, b) name specified "%foo =", c) number specified: "%4 =".
LocTy NameLoc = Lex.getLoc();
@@ -7108,8 +7118,16 @@ bool LLParser::parseBasicBlock(PerFunctionState &PFS) {
for (DbgRecordPtr &DR : TrailingDbgRecord)
BB->insertDbgRecordBefore(DR.release(), Inst->getIterator());
TrailingDbgRecord.clear();
+ if (ParserContext) {
+ ParserContext->addInstructionLocation(
+ Inst, FileLocRange(InstStart, Lex.getPrevTokEndLineColumnPos()));
+ }
} while (!Inst->isTerminator());
+ if (ParserContext)
+ ParserContext->addBlockLocation(
+ BB, FileLocRange(BBStart, Lex.getPrevTokEndLineColumnPos()));
+
assert(TrailingDbgRecord.empty() &&
"All debug values should have been attached to an instruction.");
diff --git a/llvm/lib/AsmParser/Parser.cpp b/llvm/lib/AsmParser/Parser.cpp
index 07fdce9..c5346d0 100644
--- a/llvm/lib/AsmParser/Parser.cpp
+++ b/llvm/lib/AsmParser/Parser.cpp
@@ -24,33 +24,38 @@ using namespace llvm;
static bool parseAssemblyInto(MemoryBufferRef F, Module *M,
ModuleSummaryIndex *Index, SMDiagnostic &Err,
SlotMapping *Slots, bool UpgradeDebugInfo,
- DataLayoutCallbackTy DataLayoutCallback) {
+ DataLayoutCallbackTy DataLayoutCallback,
+ AsmParserContext *ParserContext = nullptr) {
SourceMgr SM;
std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(F);
SM.AddNewSourceBuffer(std::move(Buf), SMLoc());
std::optional<LLVMContext> OptContext;
return LLParser(F.getBuffer(), SM, Err, M, Index,
- M ? M->getContext() : OptContext.emplace(), Slots)
+ M ? M->getContext() : OptContext.emplace(), Slots,
+ ParserContext)
.Run(UpgradeDebugInfo, DataLayoutCallback);
}
bool llvm::parseAssemblyInto(MemoryBufferRef F, Module *M,
ModuleSummaryIndex *Index, SMDiagnostic &Err,
SlotMapping *Slots,
- DataLayoutCallbackTy DataLayoutCallback) {
+ DataLayoutCallbackTy DataLayoutCallback,
+ AsmParserContext *ParserContext) {
return ::parseAssemblyInto(F, M, Index, Err, Slots,
- /*UpgradeDebugInfo*/ true, DataLayoutCallback);
+ /*UpgradeDebugInfo*/ true, DataLayoutCallback,
+ ParserContext);
}
std::unique_ptr<Module>
llvm::parseAssembly(MemoryBufferRef F, SMDiagnostic &Err, LLVMContext &Context,
- SlotMapping *Slots,
- DataLayoutCallbackTy DataLayoutCallback) {
+ SlotMapping *Slots, DataLayoutCallbackTy DataLayoutCallback,
+ AsmParserContext *ParserContext) {
std::unique_ptr<Module> M =
std::make_unique<Module>(F.getBufferIdentifier(), Context);
- if (parseAssemblyInto(F, M.get(), nullptr, Err, Slots, DataLayoutCallback))
+ if (parseAssemblyInto(F, M.get(), nullptr, Err, Slots, DataLayoutCallback,
+ ParserContext))
return nullptr;
return M;
@@ -133,12 +138,14 @@ ParsedModuleAndIndex llvm::parseAssemblyFileWithIndexNoUpgradeDebugInfo(
DataLayoutCallback);
}
-std::unique_ptr<Module> llvm::parseAssemblyString(StringRef AsmString,
- SMDiagnostic &Err,
- LLVMContext &Context,
- SlotMapping *Slots) {
+std::unique_ptr<Module>
+llvm::parseAssemblyString(StringRef AsmString, SMDiagnostic &Err,
+ LLVMContext &Context, SlotMapping *Slots,
+ AsmParserContext *ParserContext) {
MemoryBufferRef F(AsmString, "<string>");
- return parseAssembly(F, Err, Context, Slots);
+ return parseAssembly(
+ F, Err, Context, Slots, [](StringRef, StringRef) { return std::nullopt; },
+ ParserContext);
}
static bool parseSummaryIndexAssemblyInto(MemoryBufferRef F,
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index cf7efbfa..466dcb0 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -8603,7 +8603,7 @@ getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream,
case bitc::FS_FLAGS: { // [flags]
uint64_t Flags = Record[0];
// Scan flags.
- assert(Flags <= 0x2ff && "Unexpected bits in flag");
+ assert(Flags <= 0x7ff && "Unexpected bits in flag");
bool EnableSplitLTOUnit = Flags & 0x8;
bool UnifiedLTO = Flags & 0x200;
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
index 9288d7e..9c0b68b 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -334,7 +334,7 @@ public:
const DIE &TyDIE);
protected:
- ~DwarfUnit();
+ ~DwarfUnit() override;
/// Create new static data member DIE.
DIE *getOrCreateStaticMemberDIE(const DIDerivedType *DT);
diff --git a/llvm/lib/CodeGen/GlobalISel/Combiner.cpp b/llvm/lib/CodeGen/GlobalISel/Combiner.cpp
index 2cba6f0..0665437 100644
--- a/llvm/lib/CodeGen/GlobalISel/Combiner.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Combiner.cpp
@@ -62,7 +62,7 @@ public:
static std::unique_ptr<WorkListMaintainer>
create(Level Lvl, WorkListTy &WorkList, MachineRegisterInfo &MRI);
- virtual ~WorkListMaintainer() = default;
+ ~WorkListMaintainer() override = default;
void reportFullyCreatedInstrs() {
LLVM_DEBUG({
@@ -95,7 +95,7 @@ public:
WorkListMaintainerImpl(WorkListTy &WorkList, MachineRegisterInfo &MRI)
: WorkList(WorkList), MRI(MRI) {}
- virtual ~WorkListMaintainerImpl() = default;
+ ~WorkListMaintainerImpl() override = default;
void reset() override {
DeferList.clear();
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 884c3f1..1fe38d6 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -139,7 +139,7 @@ class DILocationVerifier : public GISelChangeObserver {
public:
DILocationVerifier() = default;
- ~DILocationVerifier() = default;
+ ~DILocationVerifier() override = default;
const Instruction *getCurrentInst() const { return CurrInst; }
void setCurrentInst(const Instruction *Inst) { CurrInst = Inst; }
diff --git a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp
index b655375..94e3a82 100644
--- a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp
+++ b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp
@@ -69,7 +69,7 @@ public:
static char ID;
LiveDebugValuesLegacy();
- ~LiveDebugValuesLegacy() = default;
+ ~LiveDebugValuesLegacy() override = default;
/// Calculate the liveness information for the given machine function.
bool runOnMachineFunction(MachineFunction &MF) override;
diff --git a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp
index b9ea03f..1c4b2f9 100644
--- a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp
+++ b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp
@@ -1094,7 +1094,7 @@ public:
/// Default construct and initialize the pass.
VarLocBasedLDV();
- ~VarLocBasedLDV();
+ ~VarLocBasedLDV() override;
/// Print to ostream with a message.
void printVarLocInMBB(const MachineFunction &MF, const VarLocInMBB &V,
diff --git a/llvm/lib/CodeGen/MIR2Vec.cpp b/llvm/lib/CodeGen/MIR2Vec.cpp
index 99be1fc0..75ca06a 100644
--- a/llvm/lib/CodeGen/MIR2Vec.cpp
+++ b/llvm/lib/CodeGen/MIR2Vec.cpp
@@ -42,6 +42,13 @@ static cl::opt<std::string>
cl::opt<float> OpcWeight("mir2vec-opc-weight", cl::Optional, cl::init(1.0),
cl::desc("Weight for machine opcode embeddings"),
cl::cat(MIR2VecCategory));
+cl::opt<float> CommonOperandWeight(
+ "mir2vec-common-operand-weight", cl::Optional, cl::init(1.0),
+ cl::desc("Weight for common operand embeddings"), cl::cat(MIR2VecCategory));
+cl::opt<float>
+ RegOperandWeight("mir2vec-reg-operand-weight", cl::Optional, cl::init(1.0),
+ cl::desc("Weight for register operand embeddings"),
+ cl::cat(MIR2VecCategory));
cl::opt<MIR2VecKind> MIR2VecEmbeddingKind(
"mir2vec-kind", cl::Optional,
cl::values(clEnumValN(MIR2VecKind::Symbolic, "symbolic",
@@ -56,26 +63,52 @@ cl::opt<MIR2VecKind> MIR2VecEmbeddingKind(
// Vocabulary
//===----------------------------------------------------------------------===//
-MIRVocabulary::MIRVocabulary(VocabMap &&OpcodeEntries,
- const TargetInstrInfo &TII)
- : TII(TII) {
+MIRVocabulary::MIRVocabulary(VocabMap &&OpcodeMap, VocabMap &&CommonOperandMap,
+ VocabMap &&PhysicalRegisterMap,
+ VocabMap &&VirtualRegisterMap,
+ const TargetInstrInfo &TII,
+ const TargetRegisterInfo &TRI,
+ const MachineRegisterInfo &MRI)
+ : TII(TII), TRI(TRI), MRI(MRI) {
buildCanonicalOpcodeMapping();
-
unsigned CanonicalOpcodeCount = UniqueBaseOpcodeNames.size();
assert(CanonicalOpcodeCount > 0 &&
"No canonical opcodes found for target - invalid vocabulary");
- Layout.OperandBase = CanonicalOpcodeCount;
- generateStorage(OpcodeEntries);
+
+ buildRegisterOperandMapping();
+
+ // Define layout of vocabulary sections
+ Layout.OpcodeBase = 0;
+ Layout.CommonOperandBase = CanonicalOpcodeCount;
+ // We expect same classes for physical and virtual registers
+ Layout.PhyRegBase = Layout.CommonOperandBase + std::size(CommonOperandNames);
+ Layout.VirtRegBase = Layout.PhyRegBase + RegisterOperandNames.size();
+
+ generateStorage(OpcodeMap, CommonOperandMap, PhysicalRegisterMap,
+ VirtualRegisterMap);
Layout.TotalEntries = Storage.size();
}
-Expected<MIRVocabulary> MIRVocabulary::create(VocabMap &&Entries,
- const TargetInstrInfo &TII) {
- if (Entries.empty())
+Expected<MIRVocabulary>
+MIRVocabulary::create(VocabMap &&OpcodeMap, VocabMap &&CommonOperandMap,
+ VocabMap &&PhyRegMap, VocabMap &&VirtRegMap,
+ const TargetInstrInfo &TII, const TargetRegisterInfo &TRI,
+ const MachineRegisterInfo &MRI) {
+ if (OpcodeMap.empty() || CommonOperandMap.empty() || PhyRegMap.empty() ||
+ VirtRegMap.empty())
return createStringError(errc::invalid_argument,
"Empty vocabulary entries provided");
- return MIRVocabulary(std::move(Entries), TII);
+ MIRVocabulary Vocab(std::move(OpcodeMap), std::move(CommonOperandMap),
+ std::move(PhyRegMap), std::move(VirtRegMap), TII, TRI,
+ MRI);
+
+ // Validate Storage after construction
+ if (!Vocab.Storage.isValid())
+ return createStringError(errc::invalid_argument,
+ "Failed to create valid vocabulary storage");
+ Vocab.ZeroEmbedding = Embedding(Vocab.Storage.getDimension(), 0.0);
+ return std::move(Vocab);
}
std::string MIRVocabulary::extractBaseOpcodeName(StringRef InstrName) {
@@ -122,22 +155,74 @@ unsigned MIRVocabulary::getCanonicalOpcodeIndex(unsigned Opcode) const {
return getCanonicalIndexForBaseName(BaseOpcode);
}
+unsigned
+MIRVocabulary::getCanonicalIndexForOperandName(StringRef OperandName) const {
+ auto It = std::find(std::begin(CommonOperandNames),
+ std::end(CommonOperandNames), OperandName);
+ assert(It != std::end(CommonOperandNames) &&
+ "Operand name not found in common operands");
+ return Layout.CommonOperandBase +
+ std::distance(std::begin(CommonOperandNames), It);
+}
+
+unsigned
+MIRVocabulary::getCanonicalIndexForRegisterClass(StringRef RegName,
+ bool IsPhysical) const {
+ auto It = std::find(RegisterOperandNames.begin(), RegisterOperandNames.end(),
+ RegName);
+ assert(It != RegisterOperandNames.end() &&
+ "Register name not found in register operands");
+ unsigned LocalIndex = std::distance(RegisterOperandNames.begin(), It);
+ return (IsPhysical ? Layout.PhyRegBase : Layout.VirtRegBase) + LocalIndex;
+}
+
std::string MIRVocabulary::getStringKey(unsigned Pos) const {
assert(Pos < Layout.TotalEntries && "Position out of bounds in vocabulary");
- // For now, all entries are opcodes since we only have one section
- if (Pos < Layout.OperandBase && Pos < UniqueBaseOpcodeNames.size()) {
+ // Handle opcodes section
+ if (Pos < Layout.CommonOperandBase) {
// Convert canonical index back to base opcode name
auto It = UniqueBaseOpcodeNames.begin();
std::advance(It, Pos);
+ assert(It != UniqueBaseOpcodeNames.end() &&
+ "Canonical index out of bounds in opcode section");
return *It;
}
- llvm_unreachable("Invalid position in vocabulary");
- return "";
+ auto getLocalIndex = [](unsigned Pos, size_t BaseOffset, size_t Bound,
+ const char *Msg) {
+ unsigned LocalIndex = Pos - BaseOffset;
+ assert(LocalIndex < Bound && Msg);
+ return LocalIndex;
+ };
+
+ // Handle common operands section
+ if (Pos < Layout.PhyRegBase) {
+ unsigned LocalIndex = getLocalIndex(
+ Pos, Layout.CommonOperandBase, std::size(CommonOperandNames),
+ "Local index out of bounds in common operands");
+ return CommonOperandNames[LocalIndex].str();
+ }
+
+ // Handle physical registers section
+ if (Pos < Layout.VirtRegBase) {
+ unsigned LocalIndex =
+ getLocalIndex(Pos, Layout.PhyRegBase, RegisterOperandNames.size(),
+ "Local index out of bounds in physical registers");
+ return "PhyReg_" + RegisterOperandNames[LocalIndex];
+ }
+
+ // Handle virtual registers section
+ unsigned LocalIndex =
+ getLocalIndex(Pos, Layout.VirtRegBase, RegisterOperandNames.size(),
+ "Local index out of bounds in virtual registers");
+ return "VirtReg_" + RegisterOperandNames[LocalIndex];
}
-void MIRVocabulary::generateStorage(const VocabMap &OpcodeMap) {
+void MIRVocabulary::generateStorage(const VocabMap &OpcodeMap,
+ const VocabMap &CommonOperandsMap,
+ const VocabMap &PhyRegMap,
+ const VocabMap &VirtRegMap) {
// Helper for handling missing entities in the vocabulary.
// Currently, we use a zero vector. In the future, we will throw an error to
@@ -151,14 +236,14 @@ void MIRVocabulary::generateStorage(const VocabMap &OpcodeMap) {
// Initialize opcode embeddings section
unsigned EmbeddingDim = OpcodeMap.begin()->second.size();
- std::vector<Embedding> OpcodeEmbeddings(Layout.OperandBase,
+ std::vector<Embedding> OpcodeEmbeddings(Layout.CommonOperandBase,
Embedding(EmbeddingDim));
// Populate opcode embeddings using canonical mapping
for (auto COpcodeName : UniqueBaseOpcodeNames) {
if (auto It = OpcodeMap.find(COpcodeName); It != OpcodeMap.end()) {
auto COpcodeIndex = getCanonicalIndexForBaseName(COpcodeName);
- assert(COpcodeIndex < Layout.OperandBase &&
+ assert(COpcodeIndex < Layout.CommonOperandBase &&
"Canonical index out of bounds");
OpcodeEmbeddings[COpcodeIndex] = It->second;
} else {
@@ -166,8 +251,39 @@ void MIRVocabulary::generateStorage(const VocabMap &OpcodeMap) {
}
}
- // TODO: Add operand/argument embeddings as additional sections
- // This will require extending the vocabulary format and layout
+ // Initialize common operand embeddings section
+ std::vector<Embedding> CommonOperandEmbeddings(std::size(CommonOperandNames),
+ Embedding(EmbeddingDim));
+ unsigned OperandIndex = 0;
+ for (const auto &CommonOperandName : CommonOperandNames) {
+ if (auto It = CommonOperandsMap.find(CommonOperandName.str());
+ It != CommonOperandsMap.end()) {
+ CommonOperandEmbeddings[OperandIndex] = It->second;
+ } else {
+ handleMissingEntity(CommonOperandName);
+ }
+ ++OperandIndex;
+ }
+
+ // Helper lambda for creating register operand embeddings
+ auto createRegisterEmbeddings = [&](const VocabMap &RegMap) {
+ std::vector<Embedding> RegEmbeddings(TRI.getNumRegClasses(),
+ Embedding(EmbeddingDim));
+ unsigned RegOperandIndex = 0;
+ for (const auto &RegOperandName : RegisterOperandNames) {
+ if (auto It = RegMap.find(RegOperandName); It != RegMap.end())
+ RegEmbeddings[RegOperandIndex] = It->second;
+ else
+ handleMissingEntity(RegOperandName);
+ ++RegOperandIndex;
+ }
+ return RegEmbeddings;
+ };
+
+ // Initialize register operand embeddings sections
+ std::vector<Embedding> PhyRegEmbeddings = createRegisterEmbeddings(PhyRegMap);
+ std::vector<Embedding> VirtRegEmbeddings =
+ createRegisterEmbeddings(VirtRegMap);
// Scale the vocabulary sections based on the provided weights
auto scaleVocabSection = [](std::vector<Embedding> &Embeddings,
@@ -176,9 +292,20 @@ void MIRVocabulary::generateStorage(const VocabMap &OpcodeMap) {
Embedding *= Weight;
};
scaleVocabSection(OpcodeEmbeddings, OpcWeight);
-
- std::vector<std::vector<Embedding>> Sections(1);
- Sections[0] = std::move(OpcodeEmbeddings);
+ scaleVocabSection(CommonOperandEmbeddings, CommonOperandWeight);
+ scaleVocabSection(PhyRegEmbeddings, RegOperandWeight);
+ scaleVocabSection(VirtRegEmbeddings, RegOperandWeight);
+
+ std::vector<std::vector<Embedding>> Sections(
+ static_cast<unsigned>(Section::MaxSections));
+ Sections[static_cast<unsigned>(Section::Opcodes)] =
+ std::move(OpcodeEmbeddings);
+ Sections[static_cast<unsigned>(Section::CommonOperands)] =
+ std::move(CommonOperandEmbeddings);
+ Sections[static_cast<unsigned>(Section::PhyRegisters)] =
+ std::move(PhyRegEmbeddings);
+ Sections[static_cast<unsigned>(Section::VirtRegisters)] =
+ std::move(VirtRegEmbeddings);
Storage = ir2vec::VocabStorage(std::move(Sections));
}
@@ -199,26 +326,94 @@ void MIRVocabulary::buildCanonicalOpcodeMapping() {
<< " unique base opcodes\n");
}
-Expected<MIRVocabulary>
-MIRVocabulary::createDummyVocabForTest(const TargetInstrInfo &TII,
- unsigned Dim) {
+void MIRVocabulary::buildRegisterOperandMapping() {
+ // Check if already built
+ if (!RegisterOperandNames.empty())
+ return;
+
+ for (unsigned RC = 0; RC < TRI.getNumRegClasses(); ++RC) {
+ const TargetRegisterClass *RegClass = TRI.getRegClass(RC);
+ if (!RegClass)
+ continue;
+
+ // Get the register class name
+ StringRef ClassName = TRI.getRegClassName(RegClass);
+ RegisterOperandNames.push_back(ClassName.str());
+ }
+}
+
+unsigned MIRVocabulary::getCommonOperandIndex(
+ MachineOperand::MachineOperandType OperandType) const {
+ assert(OperandType != MachineOperand::MO_Register &&
+ "Expected non-register operand type");
+ assert(OperandType > MachineOperand::MO_Register &&
+ OperandType < MachineOperand::MO_Last && "Operand type out of bounds");
+ return static_cast<unsigned>(OperandType) - 1;
+}
+
+unsigned MIRVocabulary::getRegisterOperandIndex(Register Reg) const {
+ assert(!RegisterOperandNames.empty() && "Register operand mapping not built");
+ assert(Reg.isValid() && "Invalid register; not expected here");
+ assert((Reg.isPhysical() || Reg.isVirtual()) &&
+ "Expected a physical or virtual register");
+
+ const TargetRegisterClass *RegClass = nullptr;
+
+ // For physical registers, use TRI to get minimal register class as a
+ // physical register can belong to multiple classes. For virtual
+ // registers, use MRI to uniquely identify the assigned register class.
+ if (Reg.isPhysical())
+ RegClass = TRI.getMinimalPhysRegClass(Reg);
+ else
+ RegClass = MRI.getRegClass(Reg);
+
+ if (RegClass)
+ return RegClass->getID();
+ // Fallback for registers without a class (shouldn't happen)
+ llvm_unreachable("Register operand without a valid register class");
+ return 0;
+}
+
+Expected<MIRVocabulary> MIRVocabulary::createDummyVocabForTest(
+ const TargetInstrInfo &TII, const TargetRegisterInfo &TRI,
+ const MachineRegisterInfo &MRI, unsigned Dim) {
assert(Dim > 0 && "Dimension must be greater than zero");
float DummyVal = 0.1f;
- // Create dummy embeddings for all canonical opcode names
- VocabMap DummyVocabMap;
+ VocabMap DummyOpcMap, DummyOperandMap, DummyPhyRegMap, DummyVirtRegMap;
+
+ // Process opcodes directly without creating temporary vocabulary
for (unsigned Opcode = 0; Opcode < TII.getNumOpcodes(); ++Opcode) {
std::string BaseOpcode = extractBaseOpcodeName(TII.getName(Opcode));
- if (DummyVocabMap.count(BaseOpcode) == 0) {
- // Only add if not already present
- DummyVocabMap[BaseOpcode] = Embedding(Dim, DummyVal);
+ if (DummyOpcMap.count(BaseOpcode) == 0) { // Only add if not already present
+ DummyOpcMap[BaseOpcode] = Embedding(Dim, DummyVal);
DummyVal += 0.1f;
}
}
- // Create and return vocabulary with dummy embeddings
- return MIRVocabulary::create(std::move(DummyVocabMap), TII);
+ // Add common operands
+ for (const auto &CommonOperandName : CommonOperandNames) {
+ DummyOperandMap[CommonOperandName.str()] = Embedding(Dim, DummyVal);
+ DummyVal += 0.1f;
+ }
+
+ // Process register classes directly
+ for (unsigned RC = 0; RC < TRI.getNumRegClasses(); ++RC) {
+ const TargetRegisterClass *RegClass = TRI.getRegClass(RC);
+ if (!RegClass)
+ continue;
+
+ std::string ClassName = TRI.getRegClassName(RegClass);
+ DummyPhyRegMap[ClassName] = Embedding(Dim, DummyVal);
+ DummyVirtRegMap[ClassName] = Embedding(Dim, DummyVal);
+ DummyVal += 0.1f;
+ }
+
+ // Create vocabulary directly without temporary instance
+ return MIRVocabulary::create(
+ std::move(DummyOpcMap), std::move(DummyOperandMap),
+ std::move(DummyPhyRegMap), std::move(DummyVirtRegMap), TII, TRI, MRI);
}
//===----------------------------------------------------------------------===//
@@ -236,9 +431,10 @@ StringRef MIR2VecVocabLegacyAnalysis::getPassName() const {
return "MIR2Vec Vocabulary Analysis";
}
-Error MIR2VecVocabLegacyAnalysis::readVocabulary() {
- // TODO: Extend vocabulary format to support multiple sections
- // (opcodes, operands, etc.) similar to IR2Vec structure
+Error MIR2VecVocabLegacyAnalysis::readVocabulary(VocabMap &OpcodeVocab,
+ VocabMap &CommonOperandVocab,
+ VocabMap &PhyRegVocabMap,
+ VocabMap &VirtRegVocabMap) {
if (VocabFile.empty())
return createStringError(
errc::invalid_argument,
@@ -255,21 +451,47 @@ Error MIR2VecVocabLegacyAnalysis::readVocabulary() {
if (!ParsedVocabValue)
return ParsedVocabValue.takeError();
- unsigned Dim = 0;
+ unsigned OpcodeDim = 0, CommonOperandDim = 0, PhyRegOperandDim = 0,
+ VirtRegOperandDim = 0;
+ if (auto Err = ir2vec::VocabStorage::parseVocabSection(
+ "Opcodes", *ParsedVocabValue, OpcodeVocab, OpcodeDim))
+ return Err;
+
+ if (auto Err = ir2vec::VocabStorage::parseVocabSection(
+ "CommonOperands", *ParsedVocabValue, CommonOperandVocab,
+ CommonOperandDim))
+ return Err;
+
+ if (auto Err = ir2vec::VocabStorage::parseVocabSection(
+ "PhysicalRegisters", *ParsedVocabValue, PhyRegVocabMap,
+ PhyRegOperandDim))
+ return Err;
+
if (auto Err = ir2vec::VocabStorage::parseVocabSection(
- "entities", *ParsedVocabValue, StrVocabMap, Dim))
+ "VirtualRegisters", *ParsedVocabValue, VirtRegVocabMap,
+ VirtRegOperandDim))
return Err;
+ // All sections must have the same embedding dimension
+ if (!(OpcodeDim == CommonOperandDim && CommonOperandDim == PhyRegOperandDim &&
+ PhyRegOperandDim == VirtRegOperandDim)) {
+ return createStringError(
+ errc::illegal_byte_sequence,
+ "MIR2Vec vocabulary sections have different dimensions");
+ }
+
return Error::success();
}
Expected<mir2vec::MIRVocabulary>
MIR2VecVocabLegacyAnalysis::getMIR2VecVocabulary(const Module &M) {
- if (StrVocabMap.empty()) {
- if (Error Err = readVocabulary()) {
- return std::move(Err);
- }
- }
+ if (Vocab.has_value())
+ return std::move(Vocab.value());
+
+ VocabMap OpcMap, CommonOperandMap, PhyRegMap, VirtRegMap;
+ if (Error Err =
+ readVocabulary(OpcMap, CommonOperandMap, PhyRegMap, VirtRegMap))
+ return std::move(Err);
// Get machine module info to access machine functions and target info
MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
@@ -280,8 +502,24 @@ MIR2VecVocabLegacyAnalysis::getMIR2VecVocabulary(const Module &M) {
continue;
if (auto *MF = MMI.getMachineFunction(F)) {
- const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
- return mir2vec::MIRVocabulary::create(std::move(StrVocabMap), *TII);
+ auto &Subtarget = MF->getSubtarget();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ if (!TII) {
+ return createStringError(errc::invalid_argument,
+ "No TargetInstrInfo available; cannot create "
+ "MIR2Vec vocabulary");
+ }
+
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+ if (!TRI) {
+ return createStringError(errc::invalid_argument,
+ "No TargetRegisterInfo available; cannot "
+ "create MIR2Vec vocabulary");
+ }
+
+ return mir2vec::MIRVocabulary::create(
+ std::move(OpcMap), std::move(CommonOperandMap), std::move(PhyRegMap),
+ std::move(VirtRegMap), *TII, *TRI, MF->getRegInfo());
}
}
@@ -351,9 +589,14 @@ Embedding SymbolicMIREmbedder::computeEmbeddings(const MachineInstr &MI) const {
if (MI.isDebugInstr())
return Embedding(Dimension, 0);
- // Todo: Add operand/argument contributions
+ // Opcode embedding
+ Embedding InstructionEmbedding = Vocab[MI.getOpcode()];
+
+ // Add operand contributions
+ for (const MachineOperand &MO : MI.operands())
+ InstructionEmbedding += Vocab[MO];
- return Vocab[MI.getOpcode()];
+ return InstructionEmbedding;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp
index 1cb57a4..ba0b025 100644
--- a/llvm/lib/CodeGen/MachineBasicBlock.cpp
+++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -1137,7 +1137,7 @@ public:
MF.setDelegate(this);
}
- ~SlotIndexUpdateDelegate() {
+ ~SlotIndexUpdateDelegate() override {
MF.resetDelegate(this);
for (auto MI : Insertions)
Indexes->insertMachineInstrInMaps(*MI);
diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
index e1d39d6..493d8df 100644
--- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -196,7 +196,7 @@ public:
CopyRewriter(MachineInstr &MI) : Rewriter(MI) {
assert(MI.isCopy() && "Expected copy instruction");
}
- virtual ~CopyRewriter() = default;
+ ~CopyRewriter() override = default;
bool getNextRewritableSource(RegSubRegPair &Src,
RegSubRegPair &Dst) override {
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 310d35d..d2ea652 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2476,16 +2476,17 @@ static bool canFoldInAddressingMode(SDNode *N, SDNode *Use, SelectionDAG &DAG,
/// masked vector operation if the target supports it.
static SDValue foldSelectWithIdentityConstant(SDNode *N, SelectionDAG &DAG,
bool ShouldCommuteOperands) {
- // Match a select as operand 1. The identity constant that we are looking for
- // is only valid as operand 1 of a non-commutative binop.
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
+
+ // Match a select as operand 1. The identity constant that we are looking for
+ // is only valid as operand 1 of a non-commutative binop.
if (ShouldCommuteOperands)
std::swap(N0, N1);
- unsigned SelOpcode = N1.getOpcode();
- if ((SelOpcode != ISD::VSELECT && SelOpcode != ISD::SELECT) ||
- !N1.hasOneUse())
+ SDValue Cond, TVal, FVal;
+ if (!sd_match(N1, m_OneUse(m_SelectLike(m_Value(Cond), m_Value(TVal),
+ m_Value(FVal)))))
return SDValue();
// We can't hoist all instructions because of immediate UB (not speculatable).
@@ -2493,11 +2494,9 @@ static SDValue foldSelectWithIdentityConstant(SDNode *N, SelectionDAG &DAG,
if (!DAG.isSafeToSpeculativelyExecuteNode(N))
return SDValue();
+ unsigned SelOpcode = N1.getOpcode();
unsigned Opcode = N->getOpcode();
EVT VT = N->getValueType(0);
- SDValue Cond = N1.getOperand(0);
- SDValue TVal = N1.getOperand(1);
- SDValue FVal = N1.getOperand(2);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// This transform increases uses of N0, so freeze it to be safe.
@@ -13856,12 +13855,11 @@ static SDValue tryToFoldExtendSelectLoad(SDNode *N, const TargetLowering &TLI,
Opcode == ISD::ANY_EXTEND) &&
"Expected EXTEND dag node in input!");
- if (!(N0->getOpcode() == ISD::SELECT || N0->getOpcode() == ISD::VSELECT) ||
- !N0.hasOneUse())
+ SDValue Cond, Op1, Op2;
+ if (!sd_match(N0, m_OneUse(m_SelectLike(m_Value(Cond), m_Value(Op1),
+ m_Value(Op2)))))
return SDValue();
- SDValue Op1 = N0->getOperand(1);
- SDValue Op2 = N0->getOperand(2);
if (!isCompatibleLoad(Op1, Opcode) || !isCompatibleLoad(Op2, Opcode))
return SDValue();
@@ -13883,7 +13881,7 @@ static SDValue tryToFoldExtendSelectLoad(SDNode *N, const TargetLowering &TLI,
SDValue Ext1 = DAG.getNode(Opcode, DL, VT, Op1);
SDValue Ext2 = DAG.getNode(Opcode, DL, VT, Op2);
- return DAG.getSelect(DL, VT, N0->getOperand(0), Ext1, Ext2);
+ return DAG.getSelect(DL, VT, Cond, Ext1, Ext2);
}
/// Try to fold a sext/zext/aext dag node into a ConstantSDNode or
@@ -17462,8 +17460,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
// fold (fsub (fpext (fneg (fmul, x, y))), z)
// -> (fneg (fma (fpext x), (fpext y), z))
// Note: This could be removed with appropriate canonicalization of the
- // input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the
- // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent
+ // input expression into (fneg (fadd (fpext (fmul, x, y)), z)). However, the
+ // command line flag -fp-contract=fast and fast-math flag contract prevent
// from implementing the canonicalization in visitFSUB.
if (matcher.match(N0, ISD::FP_EXTEND)) {
SDValue N00 = N0.getOperand(0);
@@ -17487,7 +17485,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
// -> (fneg (fma (fpext x)), (fpext y), z)
// Note: This could be removed with appropriate canonicalization of the
// input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the
- // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent
+ // command line flag -fp-contract=fast and fast-math flag contract prevent
// from implementing the canonicalization in visitFSUB.
if (matcher.match(N0, ISD::FNEG)) {
SDValue N00 = N0.getOperand(0);
@@ -29620,13 +29618,14 @@ static SDValue takeInexpensiveLog2(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
}
// c ? X : Y -> c ? Log2(X) : Log2(Y)
- if ((Op.getOpcode() == ISD::SELECT || Op.getOpcode() == ISD::VSELECT) &&
- Op.hasOneUse()) {
- if (SDValue LogX = takeInexpensiveLog2(DAG, DL, VT, Op.getOperand(1),
- Depth + 1, AssumeNonZero))
- if (SDValue LogY = takeInexpensiveLog2(DAG, DL, VT, Op.getOperand(2),
- Depth + 1, AssumeNonZero))
- return DAG.getSelect(DL, VT, Op.getOperand(0), LogX, LogY);
+ SDValue Cond, TVal, FVal;
+ if (sd_match(Op, m_OneUse(m_SelectLike(m_Value(Cond), m_Value(TVal),
+ m_Value(FVal))))) {
+ if (SDValue LogX =
+ takeInexpensiveLog2(DAG, DL, VT, TVal, Depth + 1, AssumeNonZero))
+ if (SDValue LogY =
+ takeInexpensiveLog2(DAG, DL, VT, FVal, Depth + 1, AssumeNonZero))
+ return DAG.getSelect(DL, VT, Cond, LogX, LogY);
}
// log2(umin(X, Y)) -> umin(log2(X), log2(Y))
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 20a0efd..dcf2df3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1977,8 +1977,13 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
Register InReg = FuncInfo.InitializeRegForValue(Inst);
+ std::optional<CallingConv::ID> CallConv;
+ auto *CI = dyn_cast<CallInst>(Inst);
+ if (CI && !CI->isInlineAsm())
+ CallConv = CI->getCallingConv();
+
RegsForValue RFV(*DAG.getContext(), TLI, DAG.getDataLayout(), InReg,
- Inst->getType(), std::nullopt);
+ Inst->getType(), CallConv);
SDValue Chain = DAG.getEntryNode();
return RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V);
}
diff --git a/llvm/lib/CodeGen/ShrinkWrap.cpp b/llvm/lib/CodeGen/ShrinkWrap.cpp
index 826e412..8358105 100644
--- a/llvm/lib/CodeGen/ShrinkWrap.cpp
+++ b/llvm/lib/CodeGen/ShrinkWrap.cpp
@@ -319,7 +319,7 @@ bool ShrinkWrapImpl::useOrDefCSROrFI(const MachineInstr &MI, RegScavenger *RS,
return isa<GlobalValue>(UO);
}
if (const PseudoSourceValue *PSV = Op->getPseudoValue())
- return PSV->isJumpTable();
+ return PSV->isJumpTable() || PSV->isConstantPool();
return false;
};
// Load/store operations may access the stack indirectly when we previously
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 7e5e7b5..b838e36 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -5262,33 +5262,47 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
return;
}
+ auto GetMaybeAlign = [](Value *Op) {
+ if (auto *CI = dyn_cast<ConstantInt>(Op)) {
+ uint64_t Val = CI->getZExtValue();
+ if (Val == 0)
+ return MaybeAlign();
+ if (isPowerOf2_64(Val))
+ return MaybeAlign(Val);
+ }
+ reportFatalUsageError("Invalid alignment argument");
+ };
+ auto GetAlign = [&](Value *Op) {
+ MaybeAlign Align = GetMaybeAlign(Op);
+ if (Align)
+ return *Align;
+ reportFatalUsageError("Invalid zero alignment argument");
+ };
+
const DataLayout &DL = CI->getDataLayout();
switch (NewFn->getIntrinsicID()) {
case Intrinsic::masked_load:
NewCall = Builder.CreateMaskedLoad(
- CI->getType(), CI->getArgOperand(0),
- cast<ConstantInt>(CI->getArgOperand(1))->getAlignValue(),
+ CI->getType(), CI->getArgOperand(0), GetAlign(CI->getArgOperand(1)),
CI->getArgOperand(2), CI->getArgOperand(3));
break;
case Intrinsic::masked_gather:
NewCall = Builder.CreateMaskedGather(
CI->getType(), CI->getArgOperand(0),
- DL.getValueOrABITypeAlignment(
- cast<ConstantInt>(CI->getArgOperand(1))->getMaybeAlignValue(),
- CI->getType()->getScalarType()),
+ DL.getValueOrABITypeAlignment(GetMaybeAlign(CI->getArgOperand(1)),
+ CI->getType()->getScalarType()),
CI->getArgOperand(2), CI->getArgOperand(3));
break;
case Intrinsic::masked_store:
NewCall = Builder.CreateMaskedStore(
CI->getArgOperand(0), CI->getArgOperand(1),
- cast<ConstantInt>(CI->getArgOperand(2))->getAlignValue(),
- CI->getArgOperand(3));
+ GetAlign(CI->getArgOperand(2)), CI->getArgOperand(3));
break;
case Intrinsic::masked_scatter:
NewCall = Builder.CreateMaskedScatter(
CI->getArgOperand(0), CI->getArgOperand(1),
DL.getValueOrABITypeAlignment(
- cast<ConstantInt>(CI->getArgOperand(2))->getMaybeAlignValue(),
+ GetMaybeAlign(CI->getArgOperand(2)),
CI->getArgOperand(0)->getType()->getScalarType()),
CI->getArgOperand(3));
break;
diff --git a/llvm/lib/IR/ModuleSummaryIndex.cpp b/llvm/lib/IR/ModuleSummaryIndex.cpp
index a6353664..62fd62c 100644
--- a/llvm/lib/IR/ModuleSummaryIndex.cpp
+++ b/llvm/lib/IR/ModuleSummaryIndex.cpp
@@ -111,11 +111,13 @@ uint64_t ModuleSummaryIndex::getFlags() const {
Flags |= 0x100;
if (hasUnifiedLTO())
Flags |= 0x200;
+ if (withInternalizeAndPromote())
+ Flags |= 0x400;
return Flags;
}
void ModuleSummaryIndex::setFlags(uint64_t Flags) {
- assert(Flags <= 0x2ff && "Unexpected bits in flag");
+ assert(Flags <= 0x7ff && "Unexpected bits in flag");
// 1 bit: WithGlobalValueDeadStripping flag.
// Set on combined index only.
if (Flags & 0x1)
@@ -154,6 +156,10 @@ void ModuleSummaryIndex::setFlags(uint64_t Flags) {
// Set on combined index only.
if (Flags & 0x200)
setUnifiedLTO();
+ // 1 bit: WithInternalizeAndPromote flag.
+ // Set on combined index only.
+ if (Flags & 0x400)
+ setWithInternalizeAndPromote();
}
// Collect for the given module the list of function it defines
diff --git a/llvm/lib/IRReader/IRReader.cpp b/llvm/lib/IRReader/IRReader.cpp
index a7e7dee..c16871f 100644
--- a/llvm/lib/IRReader/IRReader.cpp
+++ b/llvm/lib/IRReader/IRReader.cpp
@@ -8,6 +8,7 @@
#include "llvm/IRReader/IRReader.h"
#include "llvm-c/IRReader.h"
+#include "llvm/AsmParser/AsmParserContext.h"
#include "llvm/AsmParser/Parser.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/LLVMContext.h"
@@ -68,7 +69,8 @@ std::unique_ptr<Module> llvm::getLazyIRFileModule(StringRef Filename,
std::unique_ptr<Module> llvm::parseIR(MemoryBufferRef Buffer, SMDiagnostic &Err,
LLVMContext &Context,
- ParserCallbacks Callbacks) {
+ ParserCallbacks Callbacks,
+ llvm::AsmParserContext *ParserContext) {
NamedRegionTimer T(TimeIRParsingName, TimeIRParsingDescription,
TimeIRParsingGroupName, TimeIRParsingGroupDescription,
TimePassesIsEnabled);
@@ -88,12 +90,14 @@ std::unique_ptr<Module> llvm::parseIR(MemoryBufferRef Buffer, SMDiagnostic &Err,
return parseAssembly(Buffer, Err, Context, nullptr,
Callbacks.DataLayout.value_or(
- [](StringRef, StringRef) { return std::nullopt; }));
+ [](StringRef, StringRef) { return std::nullopt; }),
+ ParserContext);
}
std::unique_ptr<Module> llvm::parseIRFile(StringRef Filename, SMDiagnostic &Err,
LLVMContext &Context,
- ParserCallbacks Callbacks) {
+ ParserCallbacks Callbacks,
+ AsmParserContext *ParserContext) {
ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
MemoryBuffer::getFileOrSTDIN(Filename, /*IsText=*/true);
if (std::error_code EC = FileOrErr.getError()) {
@@ -102,7 +106,8 @@ std::unique_ptr<Module> llvm::parseIRFile(StringRef Filename, SMDiagnostic &Err,
return nullptr;
}
- return parseIR(FileOrErr.get()->getMemBufferRef(), Err, Context, Callbacks);
+ return parseIR(FileOrErr.get()->getMemBufferRef(), Err, Context, Callbacks,
+ ParserContext);
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index cbc0b1d..72ae064 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -551,9 +551,11 @@ void llvm::thinLTOInternalizeAndPromoteInIndex(
function_ref<bool(StringRef, ValueInfo)> isExported,
function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
isPrevailing) {
+ assert(!Index.withInternalizeAndPromote());
for (auto &I : Index)
thinLTOInternalizeAndPromoteGUID(Index.getValueInfo(I), isExported,
isPrevailing);
+ Index.setWithInternalizeAndPromote();
}
// Requires a destructor for std::vector<InputModule>.
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index e45cac8..048c58d 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -1095,6 +1095,31 @@ Expected<MemorySanitizerOptions> parseMSanPassOptions(StringRef Params) {
return Result;
}
+Expected<AllocTokenOptions> parseAllocTokenPassOptions(StringRef Params) {
+ AllocTokenOptions Result;
+ while (!Params.empty()) {
+ StringRef ParamName;
+ std::tie(ParamName, Params) = Params.split(';');
+
+ if (ParamName.consume_front("mode=")) {
+ if (auto Mode = getAllocTokenModeFromString(ParamName))
+ Result.Mode = *Mode;
+ else
+ return make_error<StringError>(
+ formatv("invalid argument to AllocToken pass mode "
+ "parameter: '{}'",
+ ParamName)
+ .str(),
+ inconvertibleErrorCode());
+ } else {
+ return make_error<StringError>(
+ formatv("invalid AllocToken pass parameter '{}'", ParamName).str(),
+ inconvertibleErrorCode());
+ }
+ }
+ return Result;
+}
+
/// Parser of parameters for SimplifyCFG pass.
Expected<SimplifyCFGOptions> parseSimplifyCFGOptions(StringRef Params) {
SimplifyCFGOptions Result;
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 884d8da..a66b6e4 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -126,7 +126,6 @@ MODULE_PASS("openmp-opt", OpenMPOptPass())
MODULE_PASS("openmp-opt-postlink",
OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink))
MODULE_PASS("partial-inliner", PartialInlinerPass())
-MODULE_PASS("alloc-token", AllocTokenPass())
MODULE_PASS("pgo-icall-prom", PGOIndirectCallPromotion())
MODULE_PASS("pgo-instr-gen", PGOInstrumentationGen())
MODULE_PASS("pgo-instr-use", PGOInstrumentationUse())
@@ -183,6 +182,10 @@ MODULE_PASS("wholeprogramdevirt", WholeProgramDevirtPass())
#define MODULE_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS)
#endif
MODULE_PASS_WITH_PARAMS(
+ "alloc-token", "AllocTokenPass",
+ [](AllocTokenOptions Opts) { return AllocTokenPass(Opts); },
+ parseAllocTokenPassOptions, "mode=<mode>")
+MODULE_PASS_WITH_PARAMS(
"asan", "AddressSanitizerPass",
[](AddressSanitizerOptions Opts) { return AddressSanitizerPass(Opts); },
parseASanPassOptions, "kernel;use-after-scope")
diff --git a/llvm/lib/Support/AllocToken.cpp b/llvm/lib/Support/AllocToken.cpp
new file mode 100644
index 0000000..8e9e89f
--- /dev/null
+++ b/llvm/lib/Support/AllocToken.cpp
@@ -0,0 +1,61 @@
+//===- AllocToken.cpp - Allocation Token Calculation ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Definition of AllocToken modes and shared calculation of stateless token IDs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/AllocToken.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SipHash.h"
+
+using namespace llvm;
+
+std::optional<AllocTokenMode>
+llvm::getAllocTokenModeFromString(StringRef Name) {
+ return StringSwitch<std::optional<AllocTokenMode>>(Name)
+ .Case("increment", AllocTokenMode::Increment)
+ .Case("random", AllocTokenMode::Random)
+ .Case("typehash", AllocTokenMode::TypeHash)
+ .Case("typehashpointersplit", AllocTokenMode::TypeHashPointerSplit)
+ .Default(std::nullopt);
+}
+
+static uint64_t getStableHash(const AllocTokenMetadata &Metadata,
+ uint64_t MaxTokens) {
+ return getStableSipHash(Metadata.TypeName) % MaxTokens;
+}
+
+std::optional<uint64_t> llvm::getAllocToken(AllocTokenMode Mode,
+ const AllocTokenMetadata &Metadata,
+ uint64_t MaxTokens) {
+ assert(MaxTokens && "Must provide non-zero max tokens");
+
+ switch (Mode) {
+ case AllocTokenMode::Increment:
+ case AllocTokenMode::Random:
+ // Stateful modes cannot be implemented as a pure function.
+ return std::nullopt;
+
+ case AllocTokenMode::TypeHash:
+ return getStableHash(Metadata, MaxTokens);
+
+ case AllocTokenMode::TypeHashPointerSplit: {
+ if (MaxTokens == 1)
+ return 0;
+ const uint64_t HalfTokens = MaxTokens / 2;
+ uint64_t Hash = getStableHash(Metadata, HalfTokens);
+ if (Metadata.ContainsPointer)
+ Hash += HalfTokens;
+ return Hash;
+ }
+ }
+
+ llvm_unreachable("");
+}
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index 42b21b5..671a5fe 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -149,6 +149,7 @@ add_llvm_component_library(LLVMSupport
AArch64BuildAttributes.cpp
ARMAttributeParser.cpp
ARMWinEH.cpp
+ AllocToken.cpp
Allocator.cpp
AutoConvert.cpp
Base64.cpp
diff --git a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
index 31fcd63..5d9215d 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -136,8 +136,8 @@ def : Pat<(f32 (bitconvert (i32 (relaxed_load<atomic_load_nonext_32>
(ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend))))),
(LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>;
def : Pat<(f32 (bitconvert (i32 (relaxed_load<atomic_load_nonext_32>
- (am_indexed32 GPR64sp:$Rn, uimm12s8:$offset))))),
- (LDRSui GPR64sp:$Rn, uimm12s8:$offset)>;
+ (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+ (LDRSui GPR64sp:$Rn, uimm12s4:$offset)>;
def : Pat<(f32 (bitconvert (i32 (relaxed_load<atomic_load_nonext_32>
(am_unscaled32 GPR64sp:$Rn, simm9:$offset))))),
(LDURSi GPR64sp:$Rn, simm9:$offset)>;
@@ -236,11 +236,11 @@ def : Pat<(relaxed_store<atomic_store_32>
def : Pat<(releasing_store<atomic_store_64> GPR64sp:$ptr, GPR64:$val),
(STLRX GPR64:$val, GPR64sp:$ptr)>;
def : Pat<(relaxed_store<atomic_store_64> (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
- ro_Wextend16:$extend),
+ ro_Wextend64:$extend),
GPR64:$val),
(STRXroW GPR64:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
def : Pat<(relaxed_store<atomic_store_64> (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
- ro_Xextend16:$extend),
+ ro_Xextend64:$extend),
GPR64:$val),
(STRXroX GPR64:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
def : Pat<(relaxed_store<atomic_store_64>
@@ -276,8 +276,8 @@ def : Pat<(relaxed_store<atomic_store_64> (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
(i64 (bitconvert (f64 FPR64Op:$val)))),
(STRDroX FPR64Op:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
def : Pat<(relaxed_store<atomic_store_64>
- (am_indexed64 GPR64sp:$Rn, uimm12s4:$offset), (i64 (bitconvert (f64 FPR64Op:$val)))),
- (STRDui FPR64Op:$val, GPR64sp:$Rn, uimm12s4:$offset)>;
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset), (i64 (bitconvert (f64 FPR64Op:$val)))),
+ (STRDui FPR64Op:$val, GPR64sp:$Rn, uimm12s8:$offset)>;
def : Pat<(relaxed_store<atomic_store_64>
(am_unscaled64 GPR64sp:$Rn, simm9:$offset), (i64 (bitconvert (f64 FPR64Op:$val)))),
(STURDi FPR64Op:$val, GPR64sp:$Rn, simm9:$offset)>;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
index fe84193..30b7b03 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -507,7 +507,7 @@ let AddedComplexity = 19 in {
defm : VecROStoreLane64_0Pat<ro32, store, v2i32, i32, ssub, STRSroW, STRSroX>;
}
-def : Pat<(v8i8 (AArch64dup (i8 (load (am_indexed8 GPR64sp:$Rn))))),
+def : Pat<(v8i8 (AArch64dup (i8 (load GPR64sp:$Rn)))),
(LD1Rv8b GPR64sp:$Rn)>;
def : Pat<(v16i8 (AArch64dup (i8 (load GPR64sp:$Rn)))),
(LD1Rv16b GPR64sp:$Rn)>;
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index ef974df..47144c7 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -993,7 +993,7 @@ def PPR_3b : PPRClass<0, 7> { // Restricted 3 bit SVE predicate register class.
let DecoderMethod = "DecodeSimpleRegisterClass<AArch64::PPRRegClassID, 0, 8>";
}
def PPR_p8to15 : PPRClass<8, 15> {
- let DecoderMethod = "DecodeSimpleRegisterClass<AArch64::PNRRegClassID, 8, 8>";
+ let DecoderMethod = "DecodeSimpleRegisterClass<AArch64::PPRRegClassID, 8, 8>";
}
def PPRMul2 : PPRClass<0, 14, 2>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2ff2d2f..d930a21 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10628,6 +10628,59 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
return false;
+ const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
+ this]() -> bool {
+ if (CmpValue != 0)
+ return false;
+
+ MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
+ if (!Def || Def->getParent() != CmpInstr.getParent())
+ return false;
+
+ const auto foldableSelect = [](MachineInstr *Def) -> bool {
+ if (Def->getOpcode() == AMDGPU::S_CSELECT_B32 ||
+ Def->getOpcode() == AMDGPU::S_CSELECT_B64) {
+ bool Op1IsNonZeroImm =
+ Def->getOperand(1).isImm() && Def->getOperand(1).getImm() != 0;
+ bool Op2IsZeroImm =
+ Def->getOperand(2).isImm() && Def->getOperand(2).getImm() == 0;
+ if (Op1IsNonZeroImm && Op2IsZeroImm)
+ return true;
+ }
+ return false;
+ };
+
+ // For S_OP that set SCC = DST!=0, do the transformation
+ //
+ // s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
+
+ // If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
+ // for S_CSELECT* already has the same value that will be calculated by
+ // s_cmp_lg_*
+ //
+ // s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero
+ // imm), 0)
+ if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(Def))
+ return false;
+
+ MachineInstr *KillsSCC = nullptr;
+ for (MachineInstr &MI :
+ make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
+ if (MI.modifiesRegister(AMDGPU::SCC, &RI))
+ return false;
+ if (MI.killsRegister(AMDGPU::SCC, &RI))
+ KillsSCC = &MI;
+ }
+
+ if (MachineOperand *SccDef =
+ Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
+ SccDef->setIsDead(false);
+ if (KillsSCC)
+ KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
+ CmpInstr.eraseFromParent();
+ return true;
+ };
+
const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
this](int64_t ExpectedValue, unsigned SrcSize,
bool IsReversible, bool IsSigned) -> bool {
@@ -10702,16 +10755,20 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
return false;
- for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
- I != E; ++I) {
- if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
- I->killsRegister(AMDGPU::SCC, &RI))
+ MachineInstr *KillsSCC = nullptr;
+ for (MachineInstr &MI :
+ make_range(std::next(Def->getIterator()), CmpInstr.getIterator())) {
+ if (MI.modifiesRegister(AMDGPU::SCC, &RI))
return false;
+ if (MI.killsRegister(AMDGPU::SCC, &RI))
+ KillsSCC = &MI;
}
MachineOperand *SccDef =
Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
SccDef->setIsDead(false);
+ if (KillsSCC)
+ KillsSCC->clearRegisterKills(AMDGPU::SCC, /*TRI=*/nullptr);
CmpInstr.eraseFromParent();
if (!MRI->use_nodbg_empty(DefReg)) {
@@ -10755,7 +10812,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
case AMDGPU::S_CMP_LG_I32:
case AMDGPU::S_CMPK_LG_U32:
case AMDGPU::S_CMPK_LG_I32:
- return optimizeCmpAnd(0, 32, true, false);
+ return optimizeCmpAnd(0, 32, true, false) || optimizeCmpSelect();
case AMDGPU::S_CMP_GT_U32:
case AMDGPU::S_CMPK_GT_U32:
return optimizeCmpAnd(0, 32, false, false);
@@ -10763,7 +10820,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
case AMDGPU::S_CMPK_GT_I32:
return optimizeCmpAnd(0, 32, false, true);
case AMDGPU::S_CMP_LG_U64:
- return optimizeCmpAnd(0, 64, true, false);
+ return optimizeCmpAnd(0, 64, true, false) || optimizeCmpSelect();
}
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index e1d7a07..5fdedda 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -714,6 +714,52 @@ public:
}
}
+ static bool setsSCCifResultIsNonZero(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::S_ABSDIFF_I32:
+ case AMDGPU::S_ABS_I32:
+ case AMDGPU::S_AND_B32:
+ case AMDGPU::S_AND_B64:
+ case AMDGPU::S_ANDN2_B32:
+ case AMDGPU::S_ANDN2_B64:
+ case AMDGPU::S_ASHR_I32:
+ case AMDGPU::S_ASHR_I64:
+ case AMDGPU::S_BCNT0_I32_B32:
+ case AMDGPU::S_BCNT0_I32_B64:
+ case AMDGPU::S_BCNT1_I32_B32:
+ case AMDGPU::S_BCNT1_I32_B64:
+ case AMDGPU::S_BFE_I32:
+ case AMDGPU::S_BFE_I64:
+ case AMDGPU::S_BFE_U32:
+ case AMDGPU::S_BFE_U64:
+ case AMDGPU::S_LSHL_B32:
+ case AMDGPU::S_LSHL_B64:
+ case AMDGPU::S_LSHR_B32:
+ case AMDGPU::S_LSHR_B64:
+ case AMDGPU::S_NAND_B32:
+ case AMDGPU::S_NAND_B64:
+ case AMDGPU::S_NOR_B32:
+ case AMDGPU::S_NOR_B64:
+ case AMDGPU::S_NOT_B32:
+ case AMDGPU::S_NOT_B64:
+ case AMDGPU::S_OR_B32:
+ case AMDGPU::S_OR_B64:
+ case AMDGPU::S_ORN2_B32:
+ case AMDGPU::S_ORN2_B64:
+ case AMDGPU::S_QUADMASK_B32:
+ case AMDGPU::S_QUADMASK_B64:
+ case AMDGPU::S_WQM_B32:
+ case AMDGPU::S_WQM_B64:
+ case AMDGPU::S_XNOR_B32:
+ case AMDGPU::S_XNOR_B64:
+ case AMDGPU::S_XOR_B32:
+ case AMDGPU::S_XOR_B64:
+ return true;
+ default:
+ return false;
+ }
+ }
+
static bool isEXP(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::EXP;
}
diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp
index 9b5fc9d..a652b7e 100644
--- a/llvm/lib/Target/BPF/BTFDebug.cpp
+++ b/llvm/lib/Target/BPF/BTFDebug.cpp
@@ -95,7 +95,24 @@ void BTFTypeDerived::completeType(BTFDebug &BDebug) {
return;
IsCompleted = true;
- BTFType.NameOff = BDebug.addString(Name);
+ switch (Kind) {
+ case BTF::BTF_KIND_PTR:
+ case BTF::BTF_KIND_CONST:
+ case BTF::BTF_KIND_VOLATILE:
+ case BTF::BTF_KIND_RESTRICT:
+ // Debug info might contain names for these types, but given that we want
+ // to keep BTF minimal and naming reference types doesn't bring any value
+ // (what matters is the completeness of the base type), we don't emit them.
+ //
+ // Furthermore, the Linux kernel refuses to load BPF programs that contain
+ // BTF with these types named:
+ // https://elixir.bootlin.com/linux/v6.17.1/source/kernel/bpf/btf.c#L2586
+ BTFType.NameOff = 0;
+ break;
+ default:
+ BTFType.NameOff = BDebug.addString(Name);
+ break;
+ }
if (NeedsFixup || !DTy)
return;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index d477522..17f04d0 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -14736,8 +14736,8 @@ SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG,
}
unsigned PPCTargetLowering::combineRepeatedFPDivisors() const {
- // Note: This functionality is used only when unsafe-fp-math is enabled, and
- // on cores with reciprocal estimates (which are used when unsafe-fp-math is
+ // Note: This functionality is used only when arcp is enabled, and
+ // on cores with reciprocal estimates (which are used when arcp is
// enabled for division), this functionality is redundant with the default
// combiner logic (once the division -> reciprocal/multiply transformation
// has taken place). As a result, this matters more for older cores than for
diff --git a/llvm/lib/Target/SPIRV/SPIRVCBufferAccess.cpp b/llvm/lib/Target/SPIRV/SPIRVCBufferAccess.cpp
index f7fb886..3ca0b40 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCBufferAccess.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCBufferAccess.cpp
@@ -35,6 +35,7 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsSPIRV.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/ReplaceConstant.h"
#define DEBUG_TYPE "spirv-cbuffer-access"
using namespace llvm;
@@ -57,6 +58,12 @@ static bool replaceCBufferAccesses(Module &M) {
if (!CBufMD)
return false;
+ SmallVector<Constant *> CBufferGlobals;
+ for (const hlsl::CBufferMapping &Mapping : *CBufMD)
+ for (const hlsl::CBufferMember &Member : Mapping.Members)
+ CBufferGlobals.push_back(Member.GV);
+ convertUsersOfConstantsToInstructions(CBufferGlobals);
+
for (const hlsl::CBufferMapping &Mapping : *CBufMD) {
Instruction *HandleDef = findHandleDef(Mapping.Handle);
if (!HandleDef) {
@@ -80,12 +87,7 @@ static bool replaceCBufferAccesses(Module &M) {
Value *GetPointerCall = Builder.CreateIntrinsic(
PtrType, Intrinsic::spv_resource_getpointer, {HandleDef, IndexVal});
- // We cannot use replaceAllUsesWith here because some uses may be
- // ConstantExprs, which cannot be replaced with non-constants.
- SmallVector<User *, 4> Users(MemberGV->users());
- for (User *U : Users) {
- U->replaceUsesOfWith(MemberGV, GetPointerCall);
- }
+ MemberGV->replaceAllUsesWith(GetPointerCall);
}
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index f973949..7ec463b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -183,6 +183,11 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
for (auto T : {MVT::i32, MVT::i64})
setOperationAction(Op, T, Custom);
+ if (Subtarget->hasRelaxedSIMD()) {
+ setOperationAction(
+ {ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXNUM, ISD::FMAXIMUMNUM},
+ {MVT::v4f32, MVT::v2f64}, Legal);
+ }
// SIMD-specific configuration
if (Subtarget->hasSIMD128()) {
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 7840620..f0ac26b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1742,6 +1742,23 @@ defm SIMD_RELAXED_FMIN :
defm SIMD_RELAXED_FMAX :
RelaxedBinary<F64x2, int_wasm_relaxed_max, "relaxed_max", 0x110>;
+let Predicates = [HasRelaxedSIMD] in {
+ foreach vec = [F32x4, F64x2] in {
+ defvar relaxed_min = !cast<NI>("SIMD_RELAXED_FMIN_"#vec);
+ defvar relaxed_max = !cast<NI>("SIMD_RELAXED_FMAX_"#vec);
+
+ // Transform standard fminimum/fmaximum to relaxed versions
+ def : Pat<(vec.vt (fminnum (vec.vt V128:$lhs), (vec.vt V128:$rhs))),
+ (relaxed_min V128:$lhs, V128:$rhs)>;
+ def : Pat<(vec.vt (fminimumnum (vec.vt V128:$lhs), (vec.vt V128:$rhs))),
+ (relaxed_min V128:$lhs, V128:$rhs)>;
+ def : Pat<(vec.vt (fmaxnum (vec.vt V128:$lhs), (vec.vt V128:$rhs))),
+ (relaxed_max V128:$lhs, V128:$rhs)>;
+ def : Pat<(vec.vt (fmaximumnum (vec.vt V128:$lhs), (vec.vt V128:$rhs))),
+ (relaxed_max V128:$lhs, V128:$rhs)>;
+ }
+}
+
//===----------------------------------------------------------------------===//
// Relaxed rounding q15 multiplication
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b54a1e7..d49f25a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -20558,7 +20558,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
// float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
// NOTE: By using fsub of a positive constant instead of fadd of a negative
- // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
+ // constant, we avoid reassociation in MachineCombiner when reassoc is
// enabled. See PR24512.
SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
// TODO: Are there any fast-math-flags to propagate here?
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 83bd6ac..1b748b7 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -5519,7 +5519,7 @@ defm VMIN : avx512_binop_s_sae<0x5D, "vmin", X86any_fmin, X86fmins, X86fminSAEs,
defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86any_fmax, X86fmaxs, X86fmaxSAEs,
SchedWriteFCmpSizes, 0>;
-// MIN/MAX nodes are commutable under "unsafe-fp-math". In this case we use
+// MIN/MAX nodes are commutable under (nnan + ninf). In this case we use
// X86fminc and X86fmaxc instead of X86fmin and X86fmax
multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _, SDNode OpNode,
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index cc30054..ac4d31d 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -15,7 +15,7 @@
//===----------------------------------------------------------------------===//
def Znver4Model : SchedMachineModel {
- // AMD SOG Zen4, 2.9.6 Dispatch
+ // AMD SOG Zen4, 2.9.8 Dispatch
// The processor may dispatch up to 6 macro ops per cycle
// into the execution engine.
let IssueWidth = 6;
@@ -46,8 +46,9 @@ def Znver4Model : SchedMachineModel {
int VecLoadLatency = 7;
// Latency of a simple store operation.
int StoreLatency = 1;
- // FIXME:
- let HighLatency = 25; // FIXME: any better choice?
+ // Mean and median value for all instructions with latencies >6
+ // Source: Zen4 Instruction Latencies spreadsheet (included with SOG)
+ let HighLatency = 13;
// AMD SOG Zen4, 2.8 Optimizing Branching
// The branch misprediction penalty is in the range from 11 to 18 cycles,
// <...>. The common case penalty is 13 cycles.
@@ -612,6 +613,7 @@ def Zn4WriteLEA : SchedWriteVariant<[
def : InstRW<[Zn4WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
+// values from uops.info
def Zn4SlowLEA16r : SchedWriteRes<[Zn4ALU0123]> {
let Latency = 2; // FIXME: not from llvm-exegesis
let ReleaseAtCycles = [4];
@@ -659,15 +661,15 @@ def : InstRW<[Zn4WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;
def Zn4WriteCMPXCHG8B : SchedWriteRes<[Zn4ALU0123]> {
let Latency = 3; // FIXME: not from llvm-exegesis
- let ReleaseAtCycles = [24];
- let NumMicroOps = 19;
+ let ReleaseAtCycles = [20];
+ let NumMicroOps = 15;
}
def : InstRW<[Zn4WriteCMPXCHG8B], (instrs CMPXCHG8B)>;
def Zn4WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn4ALU0123]> {
- let Latency = 4; // FIXME: not from llvm-exegesis
- let ReleaseAtCycles = [59];
- let NumMicroOps = 28;
+ let Latency = 2; // FIXME: not from llvm-exegesis
+ let ReleaseAtCycles = [40];
+ let NumMicroOps = 26;
}
def : InstRW<[Zn4WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>;
@@ -681,7 +683,7 @@ def : InstRW<[Zn4WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16a
def Zn4WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
let Latency = !add(Znver4Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
let ReleaseAtCycles = [1, 1, 2];
- let NumMicroOps = 5;
+ let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;
@@ -693,19 +695,17 @@ def Zn4WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]>
def : InstRW<[Zn4WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;
// Integer division.
-// FIXME: uops for 8-bit division measures as 2. for others it's a guess.
-// FIXME: latency for 8-bit division measures as 10. for others it's a guess.
-defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 10, [10], 2>;
-defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 11, [11], 2>;
-defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 13, [13], 2>;
-defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 17, [17], 2>;
-defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 10, [10], 2>;
-defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 11, [11], 2>;
-defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 13, [13], 2>;
-defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 17, [17], 2>;
-
-defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan forward.
-defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan reverse.
+defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 9, [9], 2>;
+defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 10, [10], 2>;
+defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 12, [12], 2>;
+defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 18, [18], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 9, [9], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 10, [10], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 12, [12], 2>;
+defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 18, [18], 2>;
+
+defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 1, /*LoadUOps=*/1>; // Bit scan forward.
+defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 1, /*LoadUOps=*/1>; // Bit scan reverse.
defm : Zn4WriteResIntPair<WritePOPCNT, [Zn4ALU0123], 1, [1], 1>; // Bit population count.
@@ -725,12 +725,12 @@ def Zn4WriteLZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
}
def : InstRW<[Zn4WriteLZCNT16rr], (instrs LZCNT16rr)>;
-defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 2, [1], 2>; // Trailing zero count.
+defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 1, [1], 1>; // Trailing zero count.
def Zn4WriteTZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
- let Latency = 2;
- let ReleaseAtCycles = [4];
- let NumMicroOps = 2;
+ let Latency = 1;
+ let ReleaseAtCycles = [1];
+ let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteTZCNT16rr], (instrs TZCNT16rr)>;
@@ -1109,15 +1109,31 @@ def Zn4WriteVecOpMaskKRMov : SchedWriteRes<[Zn4FPOpMask4]> {
}
def : InstRW<[Zn4WriteVecOpMaskKRMov], (instrs KMOVBkr, KMOVDkr, KMOVQkr, KMOVWkr)>;
-def Zn4WriteVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
- // TODO: All align instructions are expected to be of 4 cycle latency
- let Latency = 4;
+// 128-bit VALIGN
+def Zn4WriteXMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
+ let Latency = 2;
let ReleaseAtCycles = [1];
let NumMicroOps = 1;
}
-def : InstRW<[Zn4WriteVecALU2Slow], (instrs VALIGNDZrri, VALIGNDZ128rri, VALIGNDZ256rri,
- VALIGNQZrri, VALIGNQZ128rri, VALIGNQZ256rri)
- >;
+
+// 256-bit VALIGN
+def Zn4WriteYMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
+ let Latency = 3;
+ let ReleaseAtCycles = [1];
+ let NumMicroOps = 1;
+}
+
+// 512-bit VALIGN
+def Zn4WriteZMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
+ let Latency = 4;
+ let ReleaseAtCycles = [2];
+ let NumMicroOps = 1;
+}
+
+def : InstRW<[Zn4WriteXMMVecALU2Slow], (instrs VALIGNDZrri, VALIGNQZrri)>;
+def : InstRW<[Zn4WriteYMMVecALU2Slow], (instrs VALIGNDZ128rri, VALIGNQZ128rri)>;
+def : InstRW<[Zn4WriteZMMVecALU2Slow], (instrs VALIGNDZ256rri, VALIGNQZ256rri)>;
+
defm : Zn4WriteResYMMPair<WriteVecALUY, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).
def Zn4WriteVecALUYSlow : SchedWriteRes<[Zn4FPVAdd01]> {
@@ -1326,9 +1342,9 @@ def : InstRW<[Zn4WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>;
// Strings instructions.
// Packed Compare Implicit Length Strings, Return Mask
-defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 6, [8], 3, /*LoadUOps=*/1>;
+defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 7, [8], 3, /*LoadUOps=*/1>;
// Packed Compare Explicit Length Strings, Return Mask
-defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 6, [12], 7, /*LoadUOps=*/5>;
+defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 7, [12], 7, /*LoadUOps=*/5>;
// Packed Compare Implicit Length Strings, Return Index
defm : Zn4WriteResXMMPair<WritePCmpIStrI, [Zn4FPVAdd0123], 2, [8], 4>;
// Packed Compare Explicit Length Strings, Return Index
@@ -1340,7 +1356,7 @@ defm : Zn4WriteResXMMPair<WriteAESIMC, [Zn4FPAES01], 4, [1], 1>; // InvMixColumn
defm : Zn4WriteResXMMPair<WriteAESKeyGen, [Zn4FPAES01], 4, [1], 1>; // Key Generation.
// Carry-less multiplication instructions.
-defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [4], 4>;
+defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [3], 4>;
// EMMS/FEMMS
defm : Zn4WriteResInt<WriteEMMS, [Zn4ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis
@@ -1386,44 +1402,44 @@ def Zn4WriteVPERM2F128rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
def : InstRW<[Zn4WriteVPERM2F128rm], (instrs VPERM2F128rmi)>;
def Zn4WriteVPERMPSYrr : SchedWriteRes<[Zn4FPVShuf]> {
- let Latency = 7;
+ let Latency = 4;
let ReleaseAtCycles = [1];
- let NumMicroOps = 2;
+ let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERMPSYrr], (instrs VPERMPSYrr)>;
def Zn4WriteVPERMPSYrm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMPSYrr.Latency);
- let ReleaseAtCycles = [1, 1, 2];
- let NumMicroOps = !add(Zn4WriteVPERMPSYrr.NumMicroOps, 1);
+ let ReleaseAtCycles = [1, 1, 1];
+ let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERMPSYrm], (instrs VPERMPSYrm)>;
def Zn4WriteVPERMYri : SchedWriteRes<[Zn4FPVShuf]> {
- let Latency = 6;
+ let Latency = 4;
let ReleaseAtCycles = [1];
- let NumMicroOps = 2;
+ let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;
def Zn4WriteVPERMPDYmi : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMYri.Latency);
- let ReleaseAtCycles = [1, 1, 2];
- let NumMicroOps = !add(Zn4WriteVPERMYri.NumMicroOps, 1);
+ let ReleaseAtCycles = [1, 1, 1];
+ let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERMPDYmi], (instrs VPERMPDYmi)>;
def Zn4WriteVPERMDYrr : SchedWriteRes<[Zn4FPVShuf]> {
- let Latency = 5;
+ let Latency = 4;
let ReleaseAtCycles = [1];
- let NumMicroOps = 2;
+ let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERMDYrr], (instrs VPERMDYrr)>;
def Zn4WriteVPERMYm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMDYrr.Latency);
- let ReleaseAtCycles = [1, 1, 2];
- let NumMicroOps = !add(Zn4WriteVPERMDYrr.NumMicroOps, 0);
+ let ReleaseAtCycles = [1, 1, 1];
+ let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERMYm], (instrs VPERMQYmi, VPERMDYrm)>;
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index bbbac45..7a95df4 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -907,10 +907,20 @@ static bool mergeConsecutivePartStores(ArrayRef<PartStore> Parts,
StoreInst *Store = Builder.CreateAlignedStore(
Val, First.Store->getPointerOperand(), First.Store->getAlign());
+ // Merge various metadata onto the new store.
AAMDNodes AATags = First.Store->getAAMetadata();
- for (const PartStore &Part : drop_begin(Parts))
+ SmallVector<Instruction *> Stores = {First.Store};
+ Stores.reserve(Parts.size());
+ SmallVector<DebugLoc> DbgLocs = {First.Store->getDebugLoc()};
+ DbgLocs.reserve(Parts.size());
+ for (const PartStore &Part : drop_begin(Parts)) {
AATags = AATags.concat(Part.Store->getAAMetadata());
+ Stores.push_back(Part.Store);
+ DbgLocs.push_back(Part.Store->getDebugLoc());
+ }
Store->setAAMetadata(AATags);
+ Store->mergeDIAssignID(Stores);
+ Store->setDebugLoc(DebugLoc::getMergedLocations(DbgLocs));
// Remove the old stores.
for (const PartStore &Part : Parts)
diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
index 76e588b..a0f7ec6 100644
--- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -24,7 +24,8 @@
// returns 0, or a single vtable's function returns 1, replace each virtual
// call with a comparison of the vptr against that vtable's address.
//
-// This pass is intended to be used during the regular and thin LTO pipelines:
+// This pass is intended to be used during the regular/thin and non-LTO
+// pipelines:
//
// During regular LTO, the pass determines the best optimization for each
// virtual call and applies the resolutions directly to virtual calls that are
@@ -48,6 +49,14 @@
// is supported.
// - Import phase: (same as with hybrid case above).
//
+// During Speculative devirtualization mode -not restricted to LTO-:
+// - The pass applies speculative devirtualization without requiring any type of
+// visibility.
+// - Skips other features like virtual constant propagation, uniform return
+// value optimization, unique return value optimization and branch funnels as
+// they need LTO.
+// - This mode is enabled via 'devirtualize-speculatively' flag.
+//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/IPO/WholeProgramDevirt.h"
@@ -61,7 +70,9 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TypeMetadataUtils.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Bitcode/BitcodeWriter.h"
@@ -145,6 +156,13 @@ static cl::opt<std::string> ClWriteSummary(
"bitcode, otherwise YAML"),
cl::Hidden);
+// TODO: This option eventually should support any public visibility vtables
+// with/out LTO.
+static cl::opt<bool> ClDevirtualizeSpeculatively(
+ "devirtualize-speculatively",
+ cl::desc("Enable speculative devirtualization optimization"),
+ cl::init(false));
+
static cl::opt<unsigned>
ClThreshold("wholeprogramdevirt-branch-funnel-threshold", cl::Hidden,
cl::init(10),
@@ -892,6 +910,8 @@ void llvm::updatePublicTypeTestCalls(Module &M,
CI->eraseFromParent();
}
} else {
+ // TODO: Don't replace public type tests when speculative devirtualization
+ // gets enabled in LTO mode.
auto *True = ConstantInt::getTrue(M.getContext());
for (Use &U : make_early_inc_range(PublicTypeTestFunc->uses())) {
auto *CI = cast<CallInst>(U.getUser());
@@ -1083,10 +1103,10 @@ bool DevirtModule::tryFindVirtualCallTargets(
if (!TM.Bits->GV->isConstant())
return false;
- // We cannot perform whole program devirtualization analysis on a vtable
- // with public LTO visibility.
- if (TM.Bits->GV->getVCallVisibility() ==
- GlobalObject::VCallVisibilityPublic)
+ // Without ClDevirtualizeSpeculatively, we cannot perform whole program
+ // devirtualization analysis on a vtable with public LTO visibility.
+ if (!ClDevirtualizeSpeculatively && TM.Bits->GV->getVCallVisibility() ==
+ GlobalObject::VCallVisibilityPublic)
return false;
Function *Fn = nullptr;
@@ -1105,6 +1125,12 @@ bool DevirtModule::tryFindVirtualCallTargets(
if (Fn->getName() == "__cxa_pure_virtual")
continue;
+ // In most cases empty functions will be overridden by the
+ // implementation of the derived class, so we can skip them.
+ if (ClDevirtualizeSpeculatively && Fn->getReturnType()->isVoidTy() &&
+ Fn->getInstructionCount() <= 1)
+ continue;
+
// We can disregard unreachable functions as possible call targets, as
// unreachable functions shouldn't be called.
if (mustBeUnreachableFunction(Fn, ExportSummary))
@@ -1223,10 +1249,12 @@ void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo,
CallTrap->setDebugLoc(CB.getDebugLoc());
}
- // If fallback checking is enabled, add support to compare the virtual
- // function pointer to the devirtualized target. In case of a mismatch,
- // fall back to indirect call.
- if (DevirtCheckMode == WPDCheckMode::Fallback) {
+ // If fallback checking or speculative devirtualization are enabled,
+ // add support to compare the virtual function pointer to the
+ // devirtualized target. In case of a mismatch, fall back to indirect
+ // call.
+ if (DevirtCheckMode == WPDCheckMode::Fallback ||
+ ClDevirtualizeSpeculatively) {
MDNode *Weights = MDBuilder(M.getContext()).createLikelyBranchWeights();
// Version the indirect call site. If the called value is equal to the
// given callee, 'NewInst' will be executed, otherwise the original call
@@ -2057,15 +2085,15 @@ void DevirtModule::scanTypeTestUsers(
Function *TypeTestFunc,
DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap) {
// Find all virtual calls via a virtual table pointer %p under an assumption
- // of the form llvm.assume(llvm.type.test(%p, %md)). This indicates that %p
- // points to a member of the type identifier %md. Group calls by (type ID,
- // offset) pair (effectively the identity of the virtual function) and store
- // to CallSlots.
+ // of the form llvm.assume(llvm.type.test(%p, %md)) or
+ // llvm.assume(llvm.public.type.test(%p, %md)).
+ // This indicates that %p points to a member of the type identifier %md.
+ // Group calls by (type ID, offset) pair (effectively the identity of the
+ // virtual function) and store to CallSlots.
for (Use &U : llvm::make_early_inc_range(TypeTestFunc->uses())) {
auto *CI = dyn_cast<CallInst>(U.getUser());
if (!CI)
continue;
-
// Search for virtual calls based on %p and add them to DevirtCalls.
SmallVector<DevirtCallSite, 1> DevirtCalls;
SmallVector<CallInst *, 1> Assumes;
@@ -2348,6 +2376,12 @@ bool DevirtModule::run() {
(ImportSummary && ImportSummary->partiallySplitLTOUnits()))
return false;
+ Function *PublicTypeTestFunc = nullptr;
+ // If we are in speculative devirtualization mode, we can work on the public
+ // type test intrinsics.
+ if (ClDevirtualizeSpeculatively)
+ PublicTypeTestFunc =
+ Intrinsic::getDeclarationIfExists(&M, Intrinsic::public_type_test);
Function *TypeTestFunc =
Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_test);
Function *TypeCheckedLoadFunc =
@@ -2361,8 +2395,9 @@ bool DevirtModule::run() {
// module, this pass has nothing to do. But if we are exporting, we also need
// to handle any users that appear only in the function summaries.
if (!ExportSummary &&
- (!TypeTestFunc || TypeTestFunc->use_empty() || !AssumeFunc ||
- AssumeFunc->use_empty()) &&
+ (((!PublicTypeTestFunc || PublicTypeTestFunc->use_empty()) &&
+ (!TypeTestFunc || TypeTestFunc->use_empty())) ||
+ !AssumeFunc || AssumeFunc->use_empty()) &&
(!TypeCheckedLoadFunc || TypeCheckedLoadFunc->use_empty()) &&
(!TypeCheckedLoadRelativeFunc ||
TypeCheckedLoadRelativeFunc->use_empty()))
@@ -2373,6 +2408,9 @@ bool DevirtModule::run() {
DenseMap<Metadata *, std::set<TypeMemberInfo>> TypeIdMap;
buildTypeIdentifierMap(Bits, TypeIdMap);
+ if (PublicTypeTestFunc && AssumeFunc)
+ scanTypeTestUsers(PublicTypeTestFunc, TypeIdMap);
+
if (TypeTestFunc && AssumeFunc)
scanTypeTestUsers(TypeTestFunc, TypeIdMap);
@@ -2472,8 +2510,12 @@ bool DevirtModule::run() {
.WPDRes[S.first.ByteOffset];
if (tryFindVirtualCallTargets(TargetsForSlot, TypeMemberInfos,
S.first.ByteOffset, ExportSummary)) {
-
- if (!trySingleImplDevirt(ExportSummary, TargetsForSlot, S.second, Res)) {
+ bool SingleImplDevirt =
+ trySingleImplDevirt(ExportSummary, TargetsForSlot, S.second, Res);
+ // Out of speculative devirtualization mode, Try to apply virtual constant
+ // propagation or branch funneling.
+ // TODO: This should eventually be enabled for non-public type tests.
+ if (!SingleImplDevirt && !ClDevirtualizeSpeculatively) {
DidVirtualConstProp |=
tryVirtualConstProp(TargetsForSlot, S.second, Res, S.first);
diff --git a/llvm/lib/Transforms/Instrumentation/AllocToken.cpp b/llvm/lib/Transforms/Instrumentation/AllocToken.cpp
index 29968b8..8181e4e 100644
--- a/llvm/lib/Transforms/Instrumentation/AllocToken.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AllocToken.cpp
@@ -36,6 +36,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
+#include "llvm/Support/AllocToken.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
@@ -54,47 +55,14 @@
#include <variant>
using namespace llvm;
+using TokenMode = AllocTokenMode;
#define DEBUG_TYPE "alloc-token"
namespace {
-//===--- Constants --------------------------------------------------------===//
-
-enum class TokenMode : unsigned {
- /// Incrementally increasing token ID.
- Increment = 0,
-
- /// Simple mode that returns a statically-assigned random token ID.
- Random = 1,
-
- /// Token ID based on allocated type hash.
- TypeHash = 2,
-
- /// Token ID based on allocated type hash, where the top half ID-space is
- /// reserved for types that contain pointers and the bottom half for types
- /// that do not contain pointers.
- TypeHashPointerSplit = 3,
-};
-
//===--- Command-line options ---------------------------------------------===//
-cl::opt<TokenMode> ClMode(
- "alloc-token-mode", cl::Hidden, cl::desc("Token assignment mode"),
- cl::init(TokenMode::TypeHashPointerSplit),
- cl::values(
- clEnumValN(TokenMode::Increment, "increment",
- "Incrementally increasing token ID"),
- clEnumValN(TokenMode::Random, "random",
- "Statically-assigned random token ID"),
- clEnumValN(TokenMode::TypeHash, "typehash",
- "Token ID based on allocated type hash"),
- clEnumValN(
- TokenMode::TypeHashPointerSplit, "typehashpointersplit",
- "Token ID based on allocated type hash, where the top half "
- "ID-space is reserved for types that contain pointers and the "
- "bottom half for types that do not contain pointers. ")));
-
cl::opt<std::string> ClFuncPrefix("alloc-token-prefix",
cl::desc("The allocation function prefix"),
cl::Hidden, cl::init("__alloc_token_"));
@@ -217,22 +185,19 @@ public:
using ModeBase::ModeBase;
uint64_t operator()(const CallBase &CB, OptimizationRemarkEmitter &ORE) {
- const auto [N, H] = getHash(CB, ORE);
- return N ? boundedToken(H) : H;
- }
-protected:
- std::pair<MDNode *, uint64_t> getHash(const CallBase &CB,
- OptimizationRemarkEmitter &ORE) {
if (MDNode *N = getAllocTokenMetadata(CB)) {
MDString *S = cast<MDString>(N->getOperand(0));
- return {N, getStableSipHash(S->getString())};
+ AllocTokenMetadata Metadata{S->getString(), containsPointer(N)};
+ if (auto Token = getAllocToken(TokenMode::TypeHash, Metadata, MaxTokens))
+ return *Token;
}
// Fallback.
remarkNoMetadata(CB, ORE);
- return {nullptr, ClFallbackToken};
+ return ClFallbackToken;
}
+protected:
/// Remark that there was no precise type information.
static void remarkNoMetadata(const CallBase &CB,
OptimizationRemarkEmitter &ORE) {
@@ -253,20 +218,18 @@ public:
using TypeHashMode::TypeHashMode;
uint64_t operator()(const CallBase &CB, OptimizationRemarkEmitter &ORE) {
- if (MaxTokens == 1)
- return 0;
- const uint64_t HalfTokens = MaxTokens / 2;
- const auto [N, H] = getHash(CB, ORE);
- if (!N) {
- // Pick the fallback token (ClFallbackToken), which by default is 0,
- // meaning it'll fall into the pointer-less bucket. Override by setting
- // -alloc-token-fallback if that is the wrong choice.
- return H;
+ if (MDNode *N = getAllocTokenMetadata(CB)) {
+ MDString *S = cast<MDString>(N->getOperand(0));
+ AllocTokenMetadata Metadata{S->getString(), containsPointer(N)};
+ if (auto Token = getAllocToken(TokenMode::TypeHashPointerSplit, Metadata,
+ MaxTokens))
+ return *Token;
}
- uint64_t Hash = H % HalfTokens; // base hash
- if (containsPointer(N))
- Hash += HalfTokens;
- return Hash;
+ // Pick the fallback token (ClFallbackToken), which by default is 0, meaning
+ // it'll fall into the pointer-less bucket. Override by setting
+ // -alloc-token-fallback if that is the wrong choice.
+ remarkNoMetadata(CB, ORE);
+ return ClFallbackToken;
}
};
@@ -286,7 +249,7 @@ public:
: Options(transformOptionsFromCl(std::move(Opts))), Mod(M),
FAM(MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager()),
Mode(IncrementMode(*IntPtrTy, *Options.MaxTokens)) {
- switch (ClMode.getValue()) {
+ switch (Options.Mode) {
case TokenMode::Increment:
break;
case TokenMode::Random:
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 8714741a..9829d4d 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -1793,3 +1793,13 @@ bool llvm::hasOnlySimpleTerminator(const Function &F) {
}
return true;
}
+
+Printable llvm::printBasicBlock(const BasicBlock *BB) {
+ return Printable([BB](raw_ostream &OS) {
+ if (!BB) {
+ OS << "<nullptr>";
+ return;
+ }
+ BB->printAsOperand(OS);
+ });
+}
diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
index 978d5a2..371d9e6 100644
--- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -260,9 +260,16 @@ bool PredicateInfoBuilder::stackIsInScope(const ValueDFSStack &Stack,
// next to the defs they must go with so that we can know it's time to pop
// the stack when we hit the end of the phi uses for a given def.
const ValueDFS &Top = *Stack.back().V;
- if (Top.LocalNum == LN_Last && Top.PInfo) {
- if (!VDUse.U)
- return false;
+ assert(Top.PInfo && "RenameStack should only contain predicate infos (defs)");
+ if (Top.LocalNum == LN_Last) {
+ if (!VDUse.U) {
+ assert(VDUse.PInfo && "A non-use VDUse should have a predicate info");
+ // We should reserve adjacent LN_Last defs for the same phi use.
+ return VDUse.LocalNum == LN_Last &&
+ // If the two phi defs have the same edge, they must be designated
+ // for the same succ BB.
+ getBlockEdge(Top.PInfo) == getBlockEdge(VDUse.PInfo);
+ }
auto *PHI = dyn_cast<PHINode>(VDUse.U->getUser());
if (!PHI)
return false;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index adf27be..d2c100c9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9860,6 +9860,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Get user vectorization factor and interleave count.
ElementCount UserVF = Hints.getWidth();
unsigned UserIC = Hints.getInterleave();
+ if (UserIC > 1 && !LVL.isSafeForAnyVectorWidth())
+ UserIC = 1;
// Plan how to best vectorize.
LVP.plan(UserVF, UserIC);
@@ -9924,7 +9926,15 @@ bool LoopVectorizePass::processLoop(Loop *L) {
VectorizeLoop = false;
}
- if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
+ if (UserIC == 1 && Hints.getInterleave() > 1) {
+ assert(!LVL.isSafeForAnyVectorWidth() &&
+ "UserIC should only be ignored due to unsafe dependencies");
+ LLVM_DEBUG(dbgs() << "LV: Ignoring user-specified interleave count.\n");
+ IntDiagMsg = {"InterleavingUnsafe",
+ "Ignoring user-specified interleave count due to possibly "
+ "unsafe dependencies in the loop."};
+ InterleaveLoop = false;
+ } else if (!LVP.hasPlanWithVF(VF.Width) && UserIC > 1) {
// Tell the user interleaving was avoided up-front, despite being explicitly
// requested.
LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index ff25ef5..48cf763 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -4051,7 +4051,7 @@ static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
static std::optional<ElementCount> isConsecutiveInterleaveGroup(
VPInterleaveRecipe *InterleaveR, ArrayRef<ElementCount> VFs,
VPTypeAnalysis &TypeInfo, const TargetTransformInfo &TTI) {
- if (!InterleaveR)
+ if (!InterleaveR || InterleaveR->getMask())
return std::nullopt;
Type *GroupElementTy = nullptr;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 0678bc90..83e3fca 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -41,10 +41,10 @@ class VPRecipeBase;
class VPInterleaveBase;
class VPPhiAccessors;
-// This is the base class of the VPlan Def/Use graph, used for modeling the data
-// flow into, within and out of the VPlan. VPValues can stand for live-ins
-// coming from the input IR and instructions which VPlan will generate if
-// executed.
+/// This is the base class of the VPlan Def/Use graph, used for modeling the
+/// data flow into, within and out of the VPlan. VPValues can stand for live-ins
+/// coming from the input IR and instructions which VPlan will generate if
+/// executed.
class LLVM_ABI_FOR_TEST VPValue {
friend class VPDef;
friend struct VPDoubleValueDef;
@@ -57,7 +57,7 @@ class LLVM_ABI_FOR_TEST VPValue {
SmallVector<VPUser *, 1> Users;
protected:
- // Hold the underlying Value, if any, attached to this VPValue.
+ /// Hold the underlying Value, if any, attached to this VPValue.
Value *UnderlyingVal;
/// Pointer to the VPDef that defines this VPValue. If it is nullptr, the
diff --git a/llvm/test/Analysis/BasicAA/matrix-intrinsics.ll b/llvm/test/Analysis/BasicAA/matrix-intrinsics.ll
new file mode 100644
index 0000000..1de8ab5
--- /dev/null
+++ b/llvm/test/Analysis/BasicAA/matrix-intrinsics.ll
@@ -0,0 +1,30 @@
+; RUN: opt %s -aa-pipeline=basic-aa -passes=aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
+
+; BasicAA should prove that loads from sufficiently large static offsets
+; don't overlap with matrix loads with a statically known size.
+
+define <8 x double> @non_overlapping_strided_load(ptr %src) {
+; CHECK-LABEL: Function: non_overlapping_strided_load:
+; Just Ref: %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) <-> call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2)
+; Just Mod: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) <-> %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+entry:
+ %src.offset = getelementptr inbounds double, ptr %src, i32 12
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2)
+ ret <8 x double> %l
+}
+
+define <8 x double> @overlapping_strided_load(ptr %src) {
+; CHECK-LABEL: Function: overlapping_strided_load:
+; CHECK: Just Ref: %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2) <-> call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2)
+; CHECK: Just Mod: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2) <-> %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+;
+entry:
+ %src.offset = getelementptr inbounds double, ptr %src, i32 11
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2)
+ ret <8 x double> %l
+}
+
+declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr, i32, i1, i32, i32)
+declare void @llvm.matrix.column.major.store.v8f64.i32(<8 x double>, ptr, i32, i1, i32, i32)
diff --git a/llvm/test/Assembler/autoupgrade-invalid-masked-align.ll b/llvm/test/Assembler/autoupgrade-invalid-masked-align.ll
new file mode 100644
index 0000000..458bd2e
--- /dev/null
+++ b/llvm/test/Assembler/autoupgrade-invalid-masked-align.ll
@@ -0,0 +1,49 @@
+; RUN: split-file %s %t
+; RUN: not llvm-as < %t/masked-store.ll 2>&1 | FileCheck %s --check-prefix=MASKED-STORE
+; RUN: not llvm-as < %t/masked-store-zero.ll 2>&1 | FileCheck %s --check-prefix=MASKED-STORE-ZERO
+; RUN: not llvm-as < %t/masked-load.ll 2>&1 | FileCheck %s --check-prefix=MASKED-LOAD
+; RUN: not llvm-as < %t/masked-load-zero.ll 2>&1 | FileCheck %s --check-prefix=MASKED-LOAD-ZERO
+; RUN: not llvm-as < %t/masked-scatter.ll 2>&1 | FileCheck %s --check-prefix=MASKED-SCATTER
+; RUN: not llvm-as < %t/masked-gather.ll 2>&1 | FileCheck %s --check-prefix=MASKED-GATHER
+
+;--- masked-store.ll
+; MASKED-STORE: LLVM ERROR: Invalid alignment argument
+define void @masked_store(ptr %ptr, <2 x i1> %mask, <2 x double> %val) {
+ call void @llvm.masked.store.v2f64.p0(<2 x double> %val, ptr %ptr, i32 3, <2 x i1> %mask)
+ ret void
+}
+
+;--- masked-store-zero.ll
+; MASKED-STORE-ZERO: LLVM ERROR: Invalid zero alignment argument
+define void @masked_store_zero(ptr %ptr, <2 x i1> %mask, <2 x double> %val) {
+ call void @llvm.masked.store.v2f64.p0(<2 x double> %val, ptr %ptr, i32 0, <2 x i1> %mask)
+ ret void
+}
+
+;--- masked-load.ll
+; MASKED-LOAD: LLVM ERROR: Invalid alignment argument
+define void @masked_load(ptr %ptr, <2 x i1> %mask, <2 x double> %val) {
+ call <2 x double> @llvm.masked.load.v2f64.p0(ptr %ptr, i32 3, <2 x i1> %mask, <2 x double> %val)
+ ret void
+}
+
+;--- masked-load-zero.ll
+; MASKED-LOAD-ZERO: LLVM ERROR: Invalid zero alignment argument
+define void @masked_load_zero(ptr %ptr, <2 x i1> %mask, <2 x double> %val) {
+ call <2 x double> @llvm.masked.load.v2f64.p0(ptr %ptr, i32 0, <2 x i1> %mask, <2 x double> %val)
+ ret void
+}
+
+;--- masked-scatter.ll
+; MASKED-SCATTER: LLVM ERROR: Invalid alignment argument
+define void @masked_scatter(<2 x ptr> %ptr, <2 x i1> %mask, <2 x double> %val) {
+ call void @llvm.masked.scatter.v2f64.p0(<2 x double> %val, <2 x ptr> %ptr, i32 3, <2 x i1> %mask)
+ ret void
+}
+
+;--- masked-gather.ll
+; MASKED-GATHER: LLVM ERROR: Invalid alignment argument
+define void @masked_gather(<2 x ptr> %ptr, <2 x i1> %mask, <2 x double> %val) {
+ call <2 x double> @llvm.masked.gather.v2f64.p0(<2 x ptr> %ptr, i32 3, <2 x i1> %mask, <2 x double> %val)
+ ret void
+}
diff --git a/llvm/test/Bitcode/thinlto-deadstrip-flag.ll b/llvm/test/Bitcode/thinlto-deadstrip-flag.ll
deleted file mode 100644
index 00c0131..0000000
--- a/llvm/test/Bitcode/thinlto-deadstrip-flag.ll
+++ /dev/null
@@ -1,20 +0,0 @@
-; REQUIRES: x86-registered-target
-; RUN: opt -module-summary %s -o %t.o
-
-; Ensure dead stripping performed flag is set on distributed index
-; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \
-; RUN: -r %t.o,glob,plx
-; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=WITHDEAD
-; WITHDEAD: <FLAGS op0=97/>
-
-; Ensure dead stripping performed flag is not set on distributed index
-; when option used to disable dead stripping computation.
-; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \
-; RUN: -r %t.o,glob,plx -compute-dead=false
-; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=NODEAD
-; NODEAD: <FLAGS op0=96/>
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@glob = global i32 0
diff --git a/llvm/test/Bitcode/thinlto-index-flags.ll b/llvm/test/Bitcode/thinlto-index-flags.ll
new file mode 100644
index 0000000..e957ce6
--- /dev/null
+++ b/llvm/test/Bitcode/thinlto-index-flags.ll
@@ -0,0 +1,39 @@
+; REQUIRES: x86-registered-target
+; RUN: opt -module-summary %s -o %t.o
+
+;; By default, the indexing step should perform and set the appropriate index
+;; flags for dead stripping, attribute propagation, DSO local propagation,
+;; and internalization/promotion.
+; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \
+; RUN: -r %t.o,glob,plx
+; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=ALL
+;; The flag value should be 0x461 aka 1121:
+;; 0x1: Dead stripping
+;; 0x20: Attribute propagation
+;; 0x40: DSO local propagation
+;; 0x400: Internalization/promotion
+; ALL: <FLAGS op0=1121/>
+
+;; Ensure dead stripping performed flag is not set on distributed index
+;; when option used to disable dead stripping computation.
+; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \
+; RUN: -r %t.o,glob,plx -compute-dead=false
+; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=NODEAD
+;; Flag should be 0x460 aka 1120.
+; NODEAD: <FLAGS op0=1120/>
+
+;; Disabling attribute propagation should disable that as well as DSO local
+;; propagation.
+; RUN: llvm-lto2 run %t.o -o %t.out -thinlto-distributed-indexes \
+; RUN: -r %t.o,glob,plx -propagate-attrs=false
+; RUN: llvm-bcanalyzer -dump %t.o.thinlto.bc | FileCheck %s --check-prefix=NOPROP
+;; Flag should be 0x401 aka 1025.
+; NOPROP: <FLAGS op0=1025/>
+
+;; Note there isn't currently a way to disable internalization+promotion, which
+;; are performed together.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@glob = global i32 0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir
index 97a0417..b040ff2 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir
@@ -56,7 +56,7 @@
}
- attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind }
attributes #2 = { optsize }
attributes #3 = { minsize }
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir
index fc4fbac..f24aeae 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir
@@ -47,7 +47,7 @@
ret void
}
- attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind }
...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir
index b06cadf..e4d2ca3 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir
@@ -50,7 +50,7 @@
declare void @llvm.stackprotector(ptr, ptr) #2
- attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind }
...
diff --git a/llvm/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll b/llvm/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll
index 0c1776e..6e3682a 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll
@@ -37,7 +37,7 @@ for.body: ; preds = %for.body, %entry
; Function Attrs: nounwind readnone
declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
!llvm.dbg.cu = !{!0}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll b/llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
index f2ed57e..353e818 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
@@ -325,7 +325,7 @@ entry:
declare void @hhh(double, double)
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll b/llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
index 7e97116..8da0e11 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
@@ -694,8 +694,8 @@ bb1:
; CHECK: .[[LABEL]]:
; CHECK: ret
-attributes #0 = { "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
!1 = !{!2, !2, i64 0}
!2 = !{!"int", !3, i64 0}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll b/llvm/test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll
index 296435a..937bfe4 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll
@@ -519,8 +519,8 @@ while.cond:
br label %while.cond
}
-attributes #0 = { nounwind readonly "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readonly "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/AArch64/aarch64-mov-debug-locs.mir b/llvm/test/CodeGen/AArch64/aarch64-mov-debug-locs.mir
index 45fa2be5..c05d661 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-mov-debug-locs.mir
+++ b/llvm/test/CodeGen/AArch64/aarch64-mov-debug-locs.mir
@@ -79,8 +79,8 @@
; Function Attrs: nounwind
declare void @llvm.stackprotector(ptr, ptr) #3
- attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
- attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
+ attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
attributes #2 = { nounwind readnone speculatable }
attributes #3 = { nounwind }
diff --git a/llvm/test/CodeGen/AArch64/arm64-detect-vec-redux.ll b/llvm/test/CodeGen/AArch64/arm64-detect-vec-redux.ll
index 4e86f52..071344d 100644
--- a/llvm/test/CodeGen/AArch64/arm64-detect-vec-redux.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-detect-vec-redux.ll
@@ -47,6 +47,6 @@ declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>) #1
; Function Attrs: nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) #1
-attributes #0 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/AArch64/arm64-fma-combine-with-fpfusion.ll b/llvm/test/CodeGen/AArch64/arm64-fma-combine-with-fpfusion.ll
index 9b3d539..0ddcdcc 100644
--- a/llvm/test/CodeGen/AArch64/arm64-fma-combine-with-fpfusion.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fma-combine-with-fpfusion.ll
@@ -8,5 +8,5 @@ define float @mul_add(float %a, float %b, float %c) local_unnamed_addr #0 {
ret float %add
}
-attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll
index d2ce7e6..41f57bf 100644
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll
@@ -84,7 +84,7 @@ bb3: ; preds = %bb3, %bb
; Function Attrs: nounwind readnone
declare i64 @llvm.objectsize.i64.p0(ptr, i1) #1
-attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
!1 = !{!2, !2, i64 0}
diff --git a/llvm/test/CodeGen/AArch64/arm64-ld1.ll b/llvm/test/CodeGen/AArch64/arm64-ld1.ll
index 0b22fa4..c2b2c1e 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ld1.ll
@@ -1654,24 +1654,14 @@ define %struct.__neon_float64x2x4_t @ld1_x4_v2f64(ptr %addr) {
}
define <8 x i8> @dup_ld1_from_stack(ptr %__ret) {
-; CHECK-SD-LABEL: dup_ld1_from_stack:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: sub sp, sp, #16
-; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
-; CHECK-SD-NEXT: add x8, sp, #15
-; CHECK-SD-NEXT: ld1r.8b { v0 }, [x8]
-; CHECK-SD-NEXT: add sp, sp, #16
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: dup_ld1_from_stack:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
-; CHECK-GI-NEXT: .cfi_offset w29, -16
-; CHECK-GI-NEXT: add x8, sp, #15
-; CHECK-GI-NEXT: ld1r.8b { v0 }, [x8]
-; CHECK-GI-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: dup_ld1_from_stack:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: add x8, sp, #15
+; CHECK-NEXT: ld1r.8b { v0 }, [x8]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
entry:
%item = alloca i8, align 1
%0 = load i8, ptr %item, align 1
diff --git a/llvm/test/CodeGen/AArch64/arm64-misched-basic-A53.ll b/llvm/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
index 4cdc6cc..c6cf240 100644
--- a/llvm/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
@@ -107,7 +107,7 @@ define <4 x float> @neon4xfloat(<4 x float> %A, <4 x float> %B) {
; Function Attrs: nounwind
declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1) #1
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AArch64/arm64-misched-basic-A57.ll b/llvm/test/CodeGen/AArch64/arm64-misched-basic-A57.ll
index 82b34ef..bb1a6b0 100644
--- a/llvm/test/CodeGen/AArch64/arm64-misched-basic-A57.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-misched-basic-A57.ll
@@ -108,5 +108,5 @@ for.end: ; preds = %for.cond
; Function Attrs: nounwind
declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1) #1
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AArch64/arm64-rounding.ll b/llvm/test/CodeGen/AArch64/arm64-rounding.ll
index d487aab..3ce35bf 100644
--- a/llvm/test/CodeGen/AArch64/arm64-rounding.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-rounding.ll
@@ -201,4 +201,4 @@ entry:
}
attributes #0 = { nounwind }
-attributes #1 = { nounwind "unsafe-fp-math"="true" }
+attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AArch64/arm64-storebytesmerge.ll b/llvm/test/CodeGen/AArch64/arm64-storebytesmerge.ll
index db65fdd..1486b3a 100644
--- a/llvm/test/CodeGen/AArch64/arm64-storebytesmerge.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-storebytesmerge.ll
@@ -36,6 +36,6 @@ for.end705.i: ; preds = %for.body453.i
declare void @f() local_unnamed_addr #1
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a57" "target-features"="+crc,+crypto,+fp-armv8,+neon" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a57" "target-features"="+crc,+crypto,+fp-armv8,+neon" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a57" "target-features"="+crc,+crypto,+fp-armv8,+neon" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a57" "target-features"="+crc,+crypto,+fp-armv8,+neon" "use-soft-float"="false" }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll
index fc59350..593d629 100644
--- a/llvm/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-triv-disjoint-mem-access.ll
@@ -18,7 +18,7 @@ entry:
ret i32 %1
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
!llvm.ident = !{!0}
diff --git a/llvm/test/CodeGen/AArch64/bti-branch-relaxation.ll b/llvm/test/CodeGen/AArch64/bti-branch-relaxation.ll
index 2e3b99f..c4bf7d2 100644
--- a/llvm/test/CodeGen/AArch64/bti-branch-relaxation.ll
+++ b/llvm/test/CodeGen/AArch64/bti-branch-relaxation.ll
@@ -61,4 +61,4 @@ declare dso_local void @e(...) local_unnamed_addr #0
declare dso_local i64 @llvm.aarch64.space(i32, i64) local_unnamed_addr #0
-attributes #0 = { nounwind "branch-target-enforcement" "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon,+v8.5a" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "branch-target-enforcement" "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon,+v8.5a" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/AArch64/consthoist-gep.ll b/llvm/test/CodeGen/AArch64/consthoist-gep.ll
index 031ee35..7d2aaec 100644
--- a/llvm/test/CodeGen/AArch64/consthoist-gep.ll
+++ b/llvm/test/CodeGen/AArch64/consthoist-gep.ll
@@ -108,7 +108,7 @@ bb19: ; preds = %bb3, %bb
ret void
}
-attributes #0 = { norecurse nounwind optsize ssp "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind optsize ssp "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-invaraints.ll b/llvm/test/CodeGen/AArch64/dag-combine-invaraints.ll
index 61df396..e561481 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-invaraints.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-invaraints.ll
@@ -32,5 +32,5 @@ main_:
declare i32 @printf(ptr, ...) #1
-attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
index 1a83930..9193025 100644
--- a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
@@ -2,8 +2,8 @@
; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
; load zero-extended i32, bitcast to f64
-define double @_Z9load_u64_from_u32_testPj(ptr %n){
-; CHECK-LABEL: _Z9load_u64_from_u32_testPj:
+define double @load_u64_from_u32(ptr %n){
+; CHECK-LABEL: load_u64_from_u32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ret
@@ -15,8 +15,8 @@ entry:
}
; load zero-extended i16, bitcast to f64
-define double @_Z9load_u64_from_u16_testPj(ptr %n){
-; CHECK-LABEL: _Z9load_u64_from_u16_testPj:
+define double @load_u64_from_u16(ptr %n){
+; CHECK-LABEL: load_u64_from_u16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: ret
@@ -28,8 +28,8 @@ entry:
}
; load zero-extended i8, bitcast to f64
-define double @_Z16load_u64_from_u8Ph(ptr %n){
-; CHECK-LABEL: _Z16load_u64_from_u8Ph:
+define double @load_u64_from_u8(ptr %n){
+; CHECK-LABEL: load_u64_from_u8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr b0, [x0]
; CHECK-NEXT: ret
@@ -41,8 +41,8 @@ entry:
}
; load zero-extended i16, bitcast to f32
-define float @_Z17load_u32_from_u16Pt(ptr %n){
-; CHECK-LABEL: _Z17load_u32_from_u16Pt:
+define float @load_u32_from_u16(ptr %n){
+; CHECK-LABEL: load_u32_from_u16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: ret
@@ -54,8 +54,8 @@ entry:
}
; load zero-extended i8, bitcast to f32
-define float @_Z16load_u32_from_u8Ph(ptr %n){
-; CHECK-LABEL: _Z16load_u32_from_u8Ph:
+define float @load_u32_from_u8(ptr %n){
+; CHECK-LABEL: load_u32_from_u8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr b0, [x0]
; CHECK-NEXT: ret
@@ -67,8 +67,8 @@ entry:
}
; load zero-extended i8, bitcast to f16
-define half @_Z16load_u16_from_u8Ph(ptr %n){
-; CHECK-LABEL: _Z16load_u16_from_u8Ph:
+define half @load_u16_from_u8(ptr %n){
+; CHECK-LABEL: load_u16_from_u8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr b0, [x0]
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
@@ -80,3 +80,504 @@ entry:
ret half %1
}
+
+define double @load_u64_from_u32_off1(ptr %n){
+; CHECK-LABEL: load_u64_from_u32_off1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldur w8, [x0, #1]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 1
+ %0 = load i32, ptr %p, align 4
+ %conv = zext i32 %0 to i64
+ %1 = bitcast i64 %conv to double
+ ret double %1
+}
+
+define double @load_u64_from_u16_off1(ptr %n){
+; CHECK-LABEL: load_u64_from_u16_off1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldurh w8, [x0, #1]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 1
+ %0 = load i16, ptr %p, align 2
+ %conv = zext i16 %0 to i64
+ %1 = bitcast i64 %conv to double
+ ret double %1
+}
+
+define double @load_u64_from_u8_off1(ptr %n){
+; CHECK-LABEL: load_u64_from_u8_off1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldrb w8, [x0, #1]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 1
+ %0 = load i8, ptr %p, align 1
+ %conv = zext i8 %0 to i64
+ %1 = bitcast i64 %conv to double
+ ret double %1
+}
+
+define float @load_u32_from_u16_off1(ptr %n){
+; CHECK-LABEL: load_u32_from_u16_off1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldurh w8, [x0, #1]
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 1
+ %0 = load i16, ptr %p, align 2
+ %conv = zext i16 %0 to i32
+ %1 = bitcast i32 %conv to float
+ ret float %1
+}
+
+define float @load_u32_from_u8_off1(ptr %n){
+; CHECK-LABEL: load_u32_from_u8_off1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldrb w8, [x0, #1]
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 1
+ %0 = load i8, ptr %p, align 1
+ %conv = zext i8 %0 to i32
+ %1 = bitcast i32 %conv to float
+ ret float %1
+}
+
+define half @load_u16_from_u8_off1(ptr %n){
+; CHECK-LABEL: load_u16_from_u8_off1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldrb w8, [x0, #1]
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 1
+ %0 = load i8, ptr %p, align 1
+ %conv = zext i8 %0 to i16
+ %1 = bitcast i16 %conv to half
+ ret half %1
+}
+
+
+
+define double @load_u64_from_u32_off2(ptr %n){
+; CHECK-LABEL: load_u64_from_u32_off2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldur w8, [x0, #2]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 2
+ %0 = load i32, ptr %p, align 4
+ %conv = zext i32 %0 to i64
+ %1 = bitcast i64 %conv to double
+ ret double %1
+}
+
+define double @load_u64_from_u16_off2(ptr %n){
+; CHECK-LABEL: load_u64_from_u16_off2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldrh w8, [x0, #2]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 2
+ %0 = load i16, ptr %p, align 2
+ %conv = zext i16 %0 to i64
+ %1 = bitcast i64 %conv to double
+ ret double %1
+}
+
+define double @load_u64_from_u8_off2(ptr %n){
+; CHECK-LABEL: load_u64_from_u8_off2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldrb w8, [x0, #2]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 2
+ %0 = load i8, ptr %p, align 1
+ %conv = zext i8 %0 to i64
+ %1 = bitcast i64 %conv to double
+ ret double %1
+}
+
+define float @load_u32_from_u16_off2(ptr %n){
+; CHECK-LABEL: load_u32_from_u16_off2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr h0, [x0, #2]
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 2
+ %0 = load i16, ptr %p, align 2
+ %conv = zext i16 %0 to i32
+ %1 = bitcast i32 %conv to float
+ ret float %1
+}
+
+define float @load_u32_from_u8_off2(ptr %n){
+; CHECK-LABEL: load_u32_from_u8_off2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr b0, [x0, #1]
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 2
+ %0 = load i8, ptr %p, align 1
+ %conv = zext i8 %0 to i32
+ %1 = bitcast i32 %conv to float
+ ret float %1
+}
+
+define half @load_u16_from_u8_off2(ptr %n){
+; CHECK-LABEL: load_u16_from_u8_off2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr b0, [x0, #1]
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 2
+ %0 = load i8, ptr %p, align 1
+ %conv = zext i8 %0 to i16
+ %1 = bitcast i16 %conv to half
+ ret half %1
+}
+
+
+
+define double @load_u64_from_u32_off255(ptr %n){
+; CHECK-LABEL: load_u64_from_u32_off255:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldur w8, [x0, #255]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 255
+ %0 = load i32, ptr %p, align 4
+ %conv = zext i32 %0 to i64
+ %1 = bitcast i64 %conv to double
+ ret double %1
+}
+
+define double @load_u64_from_u16_off255(ptr %n){
+; CHECK-LABEL: load_u64_from_u16_off255:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldurh w8, [x0, #255]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 255
+ %0 = load i16, ptr %p, align 2
+ %conv = zext i16 %0 to i64
+ %1 = bitcast i64 %conv to double
+ ret double %1
+}
+
+define double @load_u64_from_u8_off255(ptr %n){
+; CHECK-LABEL: load_u64_from_u8_off255:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldrb w8, [x0, #255]
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 255
+ %0 = load i8, ptr %p, align 1
+ %conv = zext i8 %0 to i64
+ %1 = bitcast i64 %conv to double
+ ret double %1
+}
+
+define float @load_u32_from_u16_off255(ptr %n){
+; CHECK-LABEL: load_u32_from_u16_off255:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldurh w8, [x0, #255]
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 255
+ %0 = load i16, ptr %p, align 2
+ %conv = zext i16 %0 to i32
+ %1 = bitcast i32 %conv to float
+ ret float %1
+}
+
+define float @load_u32_from_u8_off255(ptr %n){
+; CHECK-LABEL: load_u32_from_u8_off255:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldrb w8, [x0, #255]
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 255
+ %0 = load i8, ptr %p, align 1
+ %conv = zext i8 %0 to i32
+ %1 = bitcast i32 %conv to float
+ ret float %1
+}
+
+define half @load_u16_from_u8_off255(ptr %n){
+; CHECK-LABEL: load_u16_from_u8_off255:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldrb w8, [x0, #255]
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 255
+ %0 = load i8, ptr %p, align 1
+ %conv = zext i8 %0 to i16
+ %1 = bitcast i16 %conv to half
+ ret half %1
+}
+
+
+define double @load_u64_from_u32_off256(ptr %n){
+; CHECK-LABEL: load_u64_from_u32_off256:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr s0, [x0, #256]
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 256
+ %0 = load i32, ptr %p, align 4
+ %conv = zext i32 %0 to i64
+ %1 = bitcast i64 %conv to double
+ ret double %1
+}
+
+define double @load_u64_from_u16_off256(ptr %n){
+; CHECK-LABEL: load_u64_from_u16_off256:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr h0, [x0, #128]
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 256
+ %0 = load i16, ptr %p, align 2
+ %conv = zext i16 %0 to i64
+ %1 = bitcast i64 %conv to double
+ ret double %1
+}
+
+define double @load_u64_from_u8_off256(ptr %n){
+; CHECK-LABEL: load_u64_from_u8_off256:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr b0, [x0, #64]
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 256
+ %0 = load i8, ptr %p, align 1
+ %conv = zext i8 %0 to i64
+ %1 = bitcast i64 %conv to double
+ ret double %1
+}
+
+define float @load_u32_from_u16_off256(ptr %n){
+; CHECK-LABEL: load_u32_from_u16_off256:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr h0, [x0, #256]
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 256
+ %0 = load i16, ptr %p, align 2
+ %conv = zext i16 %0 to i32
+ %1 = bitcast i32 %conv to float
+ ret float %1
+}
+
+define float @load_u32_from_u8_off256(ptr %n){
+; CHECK-LABEL: load_u32_from_u8_off256:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr b0, [x0, #128]
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 256
+ %0 = load i8, ptr %p, align 1
+ %conv = zext i8 %0 to i32
+ %1 = bitcast i32 %conv to float
+ ret float %1
+}
+
+define half @load_u16_from_u8_off256(ptr %n){
+; CHECK-LABEL: load_u16_from_u8_off256:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr b0, [x0, #128]
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 256
+ %0 = load i8, ptr %p, align 1
+ %conv = zext i8 %0 to i16
+ %1 = bitcast i16 %conv to half
+ ret half %1
+}
+
+
+
+define double @load_u64_from_u32_offn(ptr %n){
+; CHECK-LABEL: load_u64_from_u32_offn:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr s0, [x0, #16380]
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 16380
+ %0 = load i32, ptr %p, align 4
+ %conv = zext i32 %0 to i64
+ %1 = bitcast i64 %conv to double
+ ret double %1
+}
+
+define double @load_u64_from_u16_offn(ptr %n){
+; CHECK-LABEL: load_u64_from_u16_offn:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #8190 // =0x1ffe
+; CHECK-NEXT: ldr h0, [x0, x8]
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 8190
+ %0 = load i16, ptr %p, align 2
+ %conv = zext i16 %0 to i64
+ %1 = bitcast i64 %conv to double
+ ret double %1
+}
+
+define double @load_u64_from_u8_offn(ptr %n){
+; CHECK-LABEL: load_u64_from_u8_offn:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr b0, [x0, #4095]
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 4095
+ %0 = load i8, ptr %p, align 1
+ %conv = zext i8 %0 to i64
+ %1 = bitcast i64 %conv to double
+ ret double %1
+}
+
+define float @load_u32_from_u16_offn(ptr %n){
+; CHECK-LABEL: load_u32_from_u16_offn:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr h0, [x0, #8190]
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 8190
+ %0 = load i16, ptr %p, align 2
+ %conv = zext i16 %0 to i32
+ %1 = bitcast i32 %conv to float
+ ret float %1
+}
+
+define float @load_u32_from_u8_offn(ptr %n){
+; CHECK-LABEL: load_u32_from_u8_offn:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr b0, [x0, #4095]
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 4095
+ %0 = load i8, ptr %p, align 1
+ %conv = zext i8 %0 to i32
+ %1 = bitcast i32 %conv to float
+ ret float %1
+}
+
+define half @load_u16_from_u8_offn(ptr %n){
+; CHECK-LABEL: load_u16_from_u8_offn:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr b0, [x0, #4095]
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 4095
+ %0 = load i8, ptr %p, align 1
+ %conv = zext i8 %0 to i16
+ %1 = bitcast i16 %conv to half
+ ret half %1
+}
+
+
+define double @load_u64_from_u32_offnp1(ptr %n){
+; CHECK-LABEL: load_u64_from_u32_offnp1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add x8, x0, #4, lsl #12 // =16384
+; CHECK-NEXT: ldr s0, [x8]
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 16384
+ %0 = load i32, ptr %p, align 4
+ %conv = zext i32 %0 to i64
+ %1 = bitcast i64 %conv to double
+ ret double %1
+}
+
+define double @load_u64_from_u16_offnp1(ptr %n){
+; CHECK-LABEL: load_u64_from_u16_offnp1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr h0, [x0, #4096]
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 8192
+ %0 = load i16, ptr %p, align 2
+ %conv = zext i16 %0 to i64
+ %1 = bitcast i64 %conv to double
+ ret double %1
+}
+
+define double @load_u64_from_u8_offnp1(ptr %n){
+; CHECK-LABEL: load_u64_from_u8_offnp1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr b0, [x0, #1024]
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 4096
+ %0 = load i8, ptr %p, align 1
+ %conv = zext i8 %0 to i64
+ %1 = bitcast i64 %conv to double
+ ret double %1
+}
+
+define float @load_u32_from_u16_offnp1(ptr %n){
+; CHECK-LABEL: load_u32_from_u16_offnp1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add x8, x0, #2, lsl #12 // =8192
+; CHECK-NEXT: ldr h0, [x8]
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 8192
+ %0 = load i16, ptr %p, align 2
+ %conv = zext i16 %0 to i32
+ %1 = bitcast i32 %conv to float
+ ret float %1
+}
+
+define float @load_u32_from_u8_offnp1(ptr %n){
+; CHECK-LABEL: load_u32_from_u8_offnp1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr b0, [x0, #2048]
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 4096
+ %0 = load i8, ptr %p, align 1
+ %conv = zext i8 %0 to i32
+ %1 = bitcast i32 %conv to float
+ ret float %1
+}
+
+define half @load_u16_from_u8_offnp1(ptr %n){
+; CHECK-LABEL: load_u16_from_u8_offnp1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr b0, [x0, #2048]
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+entry:
+ %p = getelementptr i8, ptr %n, i64 4096
+ %0 = load i8, ptr %p, align 1
+ %conv = zext i8 %0 to i16
+ %1 = bitcast i16 %conv to half
+ ret half %1
+}
+
diff --git a/llvm/test/CodeGen/AArch64/partial-pipeline-execution.ll b/llvm/test/CodeGen/AArch64/partial-pipeline-execution.ll
index c2ef2fa..00a8c30 100644
--- a/llvm/test/CodeGen/AArch64/partial-pipeline-execution.ll
+++ b/llvm/test/CodeGen/AArch64/partial-pipeline-execution.ll
@@ -74,7 +74,7 @@ for.body: ; preds = %for.body.preheader,
br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !10
}
-attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="non-leaf" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
diff --git a/llvm/test/CodeGen/AArch64/recp-fastmath.ll b/llvm/test/CodeGen/AArch64/recp-fastmath.ll
index 9f00621..fa1da33 100644
--- a/llvm/test/CodeGen/AArch64/recp-fastmath.ll
+++ b/llvm/test/CodeGen/AArch64/recp-fastmath.ll
@@ -164,5 +164,5 @@ define <4 x double> @d4recp1(<4 x double> %x) #1 {
; CHECK-NOT: frecps {{v[0-7]\.2d}}, {{v[0-7]\.2d}}, {{v[0-7]\.2d}}
}
-attributes #0 = { nounwind "unsafe-fp-math"="true" }
-attributes #1 = { nounwind "unsafe-fp-math"="true" "reciprocal-estimates"="div,vec-div" }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "reciprocal-estimates"="div,vec-div" }
diff --git a/llvm/test/CodeGen/AArch64/shrink-wrap-const-pool-access.mir b/llvm/test/CodeGen/AArch64/shrink-wrap-const-pool-access.mir
new file mode 100644
index 0000000..6f33a75
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/shrink-wrap-const-pool-access.mir
@@ -0,0 +1,76 @@
+# RUN: llc -mtriple=aarch64 -simplify-mir -run-pass=shrink-wrap -o - %s | FileCheck %s
+--- |
+ declare double @foo()
+
+ define double @shrink_wrap_load_from_const_pool(double %q) {
+ entry:
+ %0 = fcmp oeq double %q, 3.125500e+02
+ br i1 %0, label %common.ret, label %if.else
+
+ common.ret: ; preds = %if.else, %entry, %exit1
+ %common.ret.op = phi double [ %3, %exit1 ], [ 0.000000e+00, %entry ], [ 0.000000e+00, %if.else ]
+ ret double %common.ret.op
+
+ if.else: ; preds = %entry
+ %1 = call double @foo()
+ %2 = fcmp oeq double %1, 0.000000e+00
+ br i1 %2, label %exit1, label %common.ret
+
+ exit1: ; preds = %if.else
+ %3 = call double @foo()
+ br label %common.ret
+ }
+...
+# Following code has a load from constant pool. Accessing constant pool
+# must not be considered as a stack access and hence, shrink wrapping must
+# happen.
+# CHECK-LABEL:name: shrink_wrap_load_from_const_pool
+# CHECK: savePoint:
+# CHECK: - point: '%bb.3'
+# CHECK: restorePoint:
+# CHECK: - point: '%bb.5'
+---
+name: shrink_wrap_load_from_const_pool
+tracksRegLiveness: true
+constants:
+ - id: 0
+ value: 'double 3.125500e+02'
+ alignment: 8
+body: |
+ bb.0.entry:
+ successors: %bb.4(0x50000000), %bb.2(0x30000000)
+ liveins: $d0
+
+ renamable $d1 = COPY $d0
+ renamable $x8 = ADRP target-flags(aarch64-page) %const.0
+ renamable $d2 = LDRDui killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) %const.0 :: (load (s64) from constant-pool)
+ renamable $d0 = FMOVD0
+ nofpexcept FCMPDrr killed renamable $d1, killed renamable $d2, implicit-def $nzcv, implicit $fpcr
+ Bcc 1, %bb.2, implicit killed $nzcv
+
+ bb.4:
+ liveins: $d0
+
+ bb.1.common.ret:
+ liveins: $d0
+
+ RET_ReallyLR implicit $d0
+
+ bb.2.if.else:
+ successors: %bb.3(0x50000000), %bb.1(0x30000000)
+
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+ BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $d0
+ ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+ renamable $d1 = COPY $d0
+ renamable $d0 = FMOVD0
+ nofpexcept FCMPDri killed renamable $d1, implicit-def $nzcv, implicit $fpcr
+ Bcc 1, %bb.1, implicit killed $nzcv
+ B %bb.3
+
+ bb.3.exit1:
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+ BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $d0
+ ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+ B %bb.1
+...
diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll b/llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll
index 66ac04e..22abb8c 100644
--- a/llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll
+++ b/llvm/test/CodeGen/AArch64/stack-tagging-ex-1.ll
@@ -64,6 +64,6 @@ declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-attributes #0 = { sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind willreturn }
attributes #2 = { nounwind }
diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll b/llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll
index e5725bc..d689a76 100644
--- a/llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll
+++ b/llvm/test/CodeGen/AArch64/stack-tagging-ex-2.ll
@@ -158,10 +158,10 @@ eh.resume: ; preds = %lpad.body
resume { ptr, i32 } %eh.lpad-body
}
-attributes #0 = { noreturn sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noreturn sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind willreturn }
attributes #2 = { nounwind readnone }
-attributes #3 = { norecurse sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { norecurse sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "use-soft-float"="false" }
attributes #4 = { nounwind }
attributes #5 = { noreturn }
diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll b/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll
index 91adf82..7483622 100644
--- a/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll
+++ b/llvm/test/CodeGen/AArch64/stack-tagging-untag-placement.ll
@@ -77,6 +77,6 @@ declare void @llvm.lifetime.start.p0(ptr nocapture) #1
declare void @llvm.lifetime.end.p0(ptr nocapture) #1
-attributes #0 = { sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { sanitize_memtag "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+mte,+neon,+v8.5a" "use-soft-float"="false" }
attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AArch64/stack_guard_remat.ll b/llvm/test/CodeGen/AArch64/stack_guard_remat.ll
index 523eda61..e41d82c 100644
--- a/llvm/test/CodeGen/AArch64/stack_guard_remat.ll
+++ b/llvm/test/CodeGen/AArch64/stack_guard_remat.ll
@@ -54,7 +54,7 @@ declare void @foo3(ptr)
; Function Attrs: nounwind
declare void @llvm.lifetime.end.p0(i64, ptr nocapture)
-attributes #0 = { nounwind sspstrong "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind sspstrong "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "use-soft-float"="false" }
;--- pic.ll
!llvm.module.flags = !{!0}
diff --git a/llvm/test/CodeGen/AArch64/vector_merge_dep_check.ll b/llvm/test/CodeGen/AArch64/vector_merge_dep_check.ll
index 623ea22..89b3b89 100644
--- a/llvm/test/CodeGen/AArch64/vector_merge_dep_check.ll
+++ b/llvm/test/CodeGen/AArch64/vector_merge_dep_check.ll
@@ -24,7 +24,7 @@ define void @fn(ptr %argA, ptr %argB, ptr %a) #0 align 2 {
; CHECK: ret
-attributes #0 = { noinline norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-features"="+crc,+crypto,+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-features"="+crc,+crypto,+neon" "use-soft-float"="false" }
!llvm.ident = !{!0}
diff --git a/llvm/test/CodeGen/AArch64/wineh-frame5.mir b/llvm/test/CodeGen/AArch64/wineh-frame5.mir
index 97c5c85..32580f4 100644
--- a/llvm/test/CodeGen/AArch64/wineh-frame5.mir
+++ b/llvm/test/CodeGen/AArch64/wineh-frame5.mir
@@ -64,9 +64,9 @@
; Function Attrs: nounwind
declare void @llvm.stackprotector(ptr, ptr) #3
- attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind }
- attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
attributes #3 = { nounwind }
...
diff --git a/llvm/test/CodeGen/AArch64/wineh-frame6.mir b/llvm/test/CodeGen/AArch64/wineh-frame6.mir
index 5ba7842..d76fae1 100644
--- a/llvm/test/CodeGen/AArch64/wineh-frame6.mir
+++ b/llvm/test/CodeGen/AArch64/wineh-frame6.mir
@@ -47,8 +47,8 @@
; Function Attrs: nounwind
declare void @llvm.stackprotector(ptr, ptr) #2
- attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
- attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
+ attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
attributes #2 = { nounwind }
...
diff --git a/llvm/test/CodeGen/AArch64/wineh-frame7.mir b/llvm/test/CodeGen/AArch64/wineh-frame7.mir
index 1599098..d4e71d9 100644
--- a/llvm/test/CodeGen/AArch64/wineh-frame7.mir
+++ b/llvm/test/CodeGen/AArch64/wineh-frame7.mir
@@ -71,8 +71,8 @@
; Function Attrs: nounwind
declare void @llvm.stackprotector(ptr, ptr) #2
- attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
- attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
+ attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
attributes #2 = { nounwind }
...
diff --git a/llvm/test/CodeGen/AArch64/wineh-frame8.mir b/llvm/test/CodeGen/AArch64/wineh-frame8.mir
index 9de99ac..56f92f2 100644
--- a/llvm/test/CodeGen/AArch64/wineh-frame8.mir
+++ b/llvm/test/CodeGen/AArch64/wineh-frame8.mir
@@ -29,7 +29,7 @@
ret i32 %add
}
- attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
...
---
diff --git a/llvm/test/CodeGen/AArch64/wineh5.mir b/llvm/test/CodeGen/AArch64/wineh5.mir
index efdd4b0..1c09b78 100644
--- a/llvm/test/CodeGen/AArch64/wineh5.mir
+++ b/llvm/test/CodeGen/AArch64/wineh5.mir
@@ -73,8 +73,8 @@
; Function Attrs: nounwind
declare void @llvm.stackprotector(ptr, ptr) #2
- attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
- attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { noinline optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
+ attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
attributes #2 = { nounwind }
!llvm.module.flags = !{!0}
diff --git a/llvm/test/CodeGen/AArch64/wineh_shrinkwrap.mir b/llvm/test/CodeGen/AArch64/wineh_shrinkwrap.mir
index 2f631c2..52d0dff 100644
--- a/llvm/test/CodeGen/AArch64/wineh_shrinkwrap.mir
+++ b/llvm/test/CodeGen/AArch64/wineh_shrinkwrap.mir
@@ -56,9 +56,9 @@
; Function Attrs: nounwind
declare void @llvm.stackprotector(ptr, ptr) #3
- attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
attributes #1 = { argmemonly nounwind }
- attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "use-soft-float"="false" }
attributes #3 = { nounwind }
!llvm.module.flags = !{!0}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
index 5171403..7714c03 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -140,7 +140,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -345,7 +344,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s0, 34, v1
; CHECK-NEXT: s_and_b32 s0, vcc_lo, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
index 7b01f13..7b81669 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll
@@ -143,7 +143,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) {
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB9_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
@@ -348,7 +347,6 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_and(i32 %v1, i32 %v2) {
; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 12, v0
; CHECK-NEXT: v_cmp_lt_u32_e64 s[0:1], 34, v1
; CHECK-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cbranch_scc0 .LBB17_2
; CHECK-NEXT: ; %bb.1: ; %false
; CHECK-NEXT: s_mov_b32 s0, 33
diff --git a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
index b72eba8..8088c1b 100644
--- a/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
+++ b/llvm/test/CodeGen/AMDGPU/addsub64_carry.ll
@@ -180,11 +180,7 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B
; CHECK-LABEL: s_add64_32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, s2
-; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
; CHECK-NEXT: s_addc_u32 s1, s1, s3
-; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_addc_u32 s2, s4, 0
; CHECK-NEXT: ; return to shader part epilog
%sum64 = add i64 %val64A, %val64B
@@ -199,14 +195,10 @@ define amdgpu_ps %struct.uint96 @s_add64_32(i64 inreg %val64A, i64 inreg %val64B
define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_uadd_v2i64:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_add_u32 s10, s2, s6
-; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0
-; CHECK-NEXT: s_addc_u32 s8, s3, s7
+; CHECK-NEXT: s_add_u32 s6, s2, s6
+; CHECK-NEXT: s_addc_u32 s7, s3, s7
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT: s_add_u32 s0, s0, s4
-; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
; CHECK-NEXT: s_addc_u32 s1, s1, s5
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -215,8 +207,8 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v7
; CHECK-NEXT: v_readfirstlane_b32 s2, v6
-; CHECK-NEXT: v_mov_b32_e32 v4, s10
-; CHECK-NEXT: v_mov_b32_e32 v5, s8
+; CHECK-NEXT: v_mov_b32_e32 v4, s6
+; CHECK-NEXT: v_mov_b32_e32 v5, s7
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_mov_b32 s3, s2
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
@@ -233,14 +225,10 @@ define amdgpu_ps <2 x i64> @s_uadd_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_v2i64:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_sub_u32 s10, s2, s6
-; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[8:9], 0
-; CHECK-NEXT: s_subb_u32 s8, s3, s7
+; CHECK-NEXT: s_sub_u32 s6, s2, s6
+; CHECK-NEXT: s_subb_u32 s7, s3, s7
; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
; CHECK-NEXT: s_sub_u32 s0, s0, s4
-; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[6:7], 0
; CHECK-NEXT: s_subb_u32 s1, s1, s5
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -249,8 +237,8 @@ define amdgpu_ps <2 x i64> @s_usub_v2i64(<2 x i64> inreg %val0, <2 x i64> inreg
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v7
; CHECK-NEXT: v_readfirstlane_b32 s2, v6
-; CHECK-NEXT: v_mov_b32_e32 v4, s10
-; CHECK-NEXT: v_mov_b32_e32 v5, s8
+; CHECK-NEXT: v_mov_b32_e32 v4, s6
+; CHECK-NEXT: v_mov_b32_e32 v5, s7
; CHECK-NEXT: s_mov_b32 s1, s0
; CHECK-NEXT: s_mov_b32 s3, s2
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
@@ -268,8 +256,6 @@ define amdgpu_ps i64 @s_uadd_i64(i64 inreg %val0, i64 inreg %val1, ptr %ptrval)
; CHECK-LABEL: s_uadd_i64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, s2
-; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0
; CHECK-NEXT: s_addc_u32 s1, s1, s3
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -292,8 +278,6 @@ define amdgpu_ps i64 @s_uadd_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_uadd_p1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, 1
-; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -339,8 +323,6 @@ define amdgpu_ps i64 @s_usub_p1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_p1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_sub_u32 s0, s0, 1
-; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_subb_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
@@ -363,8 +345,6 @@ define amdgpu_ps i64 @s_usub_n1(i64 inreg %val0, i64 inreg %val1, ptr %ptrval) {
; CHECK-LABEL: s_usub_n1:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_sub_u32 s0, s0, -1
-; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_subb_u32 s1, s1, -1
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: v_mov_b32_e32 v3, s1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 948811e..51df8c3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -7821,10 +7821,9 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_addc_u32 s15, 0, s16
; GFX6-NEXT: s_add_u32 s16, s0, s1
; GFX6-NEXT: v_mov_b32_e32 v0, s16
-; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s14, s14, s15
; GFX6-NEXT: s_mul_i32 s0, s12, s14
; GFX6-NEXT: v_readfirstlane_b32 s1, v0
@@ -7855,7 +7854,6 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_add_u32 s15, s16, s0
; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s14, s14, s12
; GFX6-NEXT: s_ashr_i32 s12, s7, 31
; GFX6-NEXT: s_add_u32 s0, s6, s12
@@ -7881,52 +7879,50 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: v_readfirstlane_b32 s4, v0
; GFX6-NEXT: s_addc_u32 s4, s4, 0
; GFX6-NEXT: s_mul_i32 s14, s7, s14
-; GFX6-NEXT: s_add_u32 s14, s1, s14
-; GFX6-NEXT: v_mov_b32_e32 v0, s14
+; GFX6-NEXT: s_add_u32 s16, s1, s14
+; GFX6-NEXT: v_mov_b32_e32 v0, s16
; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0
-; GFX6-NEXT: s_addc_u32 s15, 0, s4
+; GFX6-NEXT: s_addc_u32 s17, 0, s4
; GFX6-NEXT: s_mov_b32 s1, s5
-; GFX6-NEXT: s_mul_i32 s4, s10, s15
+; GFX6-NEXT: s_mul_i32 s4, s10, s17
; GFX6-NEXT: v_readfirstlane_b32 s5, v0
; GFX6-NEXT: s_add_i32 s4, s5, s4
-; GFX6-NEXT: s_mul_i32 s5, s11, s14
-; GFX6-NEXT: s_add_i32 s16, s4, s5
-; GFX6-NEXT: s_sub_i32 s17, s7, s16
-; GFX6-NEXT: s_mul_i32 s4, s10, s14
+; GFX6-NEXT: s_mul_i32 s5, s11, s16
+; GFX6-NEXT: s_add_i32 s18, s4, s5
+; GFX6-NEXT: s_sub_i32 s14, s7, s18
+; GFX6-NEXT: s_mul_i32 s4, s10, s16
; GFX6-NEXT: s_sub_u32 s6, s6, s4
; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX6-NEXT: s_or_b32 s18, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s18, 0
-; GFX6-NEXT: s_subb_u32 s17, s17, s11
-; GFX6-NEXT: s_sub_u32 s19, s6, s10
-; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX6-NEXT: s_or_b32 s15, s4, s5
+; GFX6-NEXT: s_subb_u32 s19, s14, s11
+; GFX6-NEXT: s_sub_u32 s20, s6, s10
+; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT: s_or_b32 s14, s14, s15
+; GFX6-NEXT: s_subb_u32 s14, s19, 0
+; GFX6-NEXT: s_cmp_ge_u32 s14, s11
+; GFX6-NEXT: s_cselect_b32 s15, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s20, s10
+; GFX6-NEXT: s_cselect_b32 s19, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s14, s11
+; GFX6-NEXT: s_cselect_b32 s14, s19, s15
+; GFX6-NEXT: s_add_u32 s15, s16, 1
+; GFX6-NEXT: s_addc_u32 s19, s17, 0
+; GFX6-NEXT: s_add_u32 s20, s16, 2
+; GFX6-NEXT: s_addc_u32 s21, s17, 0
+; GFX6-NEXT: s_cmp_lg_u32 s14, 0
+; GFX6-NEXT: s_cselect_b32 s14, s20, s15
+; GFX6-NEXT: s_cselect_b32 s15, s21, s19
; GFX6-NEXT: s_or_b32 s4, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_subb_u32 s4, s17, 0
+; GFX6-NEXT: s_subb_u32 s4, s7, s18
; GFX6-NEXT: s_cmp_ge_u32 s4, s11
; GFX6-NEXT: s_cselect_b32 s5, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s19, s10
-; GFX6-NEXT: s_cselect_b32 s17, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s4, s11
-; GFX6-NEXT: s_cselect_b32 s4, s17, s5
-; GFX6-NEXT: s_add_u32 s5, s14, 1
-; GFX6-NEXT: s_addc_u32 s17, s15, 0
-; GFX6-NEXT: s_add_u32 s19, s14, 2
-; GFX6-NEXT: s_addc_u32 s20, s15, 0
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_cselect_b32 s4, s19, s5
-; GFX6-NEXT: s_cselect_b32 s5, s20, s17
-; GFX6-NEXT: s_cmp_lg_u32 s18, 0
-; GFX6-NEXT: s_subb_u32 s7, s7, s16
-; GFX6-NEXT: s_cmp_ge_u32 s7, s11
-; GFX6-NEXT: s_cselect_b32 s16, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s6, s10
; GFX6-NEXT: s_cselect_b32 s6, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s7, s11
-; GFX6-NEXT: s_cselect_b32 s6, s6, s16
-; GFX6-NEXT: s_cmp_lg_u32 s6, 0
-; GFX6-NEXT: s_cselect_b32 s5, s5, s15
-; GFX6-NEXT: s_cselect_b32 s4, s4, s14
+; GFX6-NEXT: s_cmp_eq_u32 s4, s11
+; GFX6-NEXT: s_cselect_b32 s4, s6, s5
+; GFX6-NEXT: s_cmp_lg_u32 s4, 0
+; GFX6-NEXT: s_cselect_b32 s5, s15, s17
+; GFX6-NEXT: s_cselect_b32 s4, s14, s16
; GFX6-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9]
; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT: s_sub_u32 s4, s4, s6
@@ -7949,8 +7945,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s10, 0, s8
-; GFX9-NEXT: s_subb_u32 s11, 0, s9
+; GFX9-NEXT: s_sub_u32 s4, 0, s8
+; GFX9-NEXT: s_subb_u32 s5, 0, s9
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -7960,56 +7956,52 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s12, v2
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_mul_i32 s5, s10, s12
-; GFX9-NEXT: s_mul_hi_u32 s14, s10, s4
-; GFX9-NEXT: s_mul_i32 s13, s11, s4
-; GFX9-NEXT: s_add_i32 s5, s14, s5
-; GFX9-NEXT: s_mul_i32 s15, s10, s4
-; GFX9-NEXT: s_add_i32 s5, s5, s13
-; GFX9-NEXT: s_mul_hi_u32 s14, s4, s15
-; GFX9-NEXT: s_mul_i32 s16, s4, s5
-; GFX9-NEXT: s_mul_hi_u32 s13, s4, s5
+; GFX9-NEXT: v_readfirstlane_b32 s10, v2
+; GFX9-NEXT: v_readfirstlane_b32 s11, v1
+; GFX9-NEXT: s_mul_i32 s12, s4, s10
+; GFX9-NEXT: s_mul_hi_u32 s14, s4, s11
+; GFX9-NEXT: s_mul_i32 s13, s5, s11
+; GFX9-NEXT: s_add_i32 s12, s14, s12
+; GFX9-NEXT: s_mul_i32 s15, s4, s11
+; GFX9-NEXT: s_add_i32 s12, s12, s13
+; GFX9-NEXT: s_mul_hi_u32 s14, s11, s15
+; GFX9-NEXT: s_mul_i32 s16, s11, s12
+; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12
; GFX9-NEXT: s_add_u32 s14, s14, s16
; GFX9-NEXT: s_addc_u32 s13, 0, s13
-; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15
-; GFX9-NEXT: s_mul_i32 s15, s12, s15
+; GFX9-NEXT: s_mul_hi_u32 s17, s10, s15
+; GFX9-NEXT: s_mul_i32 s15, s10, s15
; GFX9-NEXT: s_add_u32 s14, s14, s15
-; GFX9-NEXT: s_mul_hi_u32 s16, s12, s5
+; GFX9-NEXT: s_mul_hi_u32 s16, s10, s12
; GFX9-NEXT: s_addc_u32 s13, s13, s17
; GFX9-NEXT: s_addc_u32 s14, s16, 0
-; GFX9-NEXT: s_mul_i32 s5, s12, s5
-; GFX9-NEXT: s_add_u32 s5, s13, s5
+; GFX9-NEXT: s_mul_i32 s12, s10, s12
+; GFX9-NEXT: s_add_u32 s12, s13, s12
; GFX9-NEXT: s_addc_u32 s13, 0, s14
-; GFX9-NEXT: s_add_u32 s14, s4, s5
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s12, s12, s13
-; GFX9-NEXT: s_mul_i32 s4, s10, s12
-; GFX9-NEXT: s_mul_hi_u32 s5, s10, s14
-; GFX9-NEXT: s_add_i32 s4, s5, s4
-; GFX9-NEXT: s_mul_i32 s11, s11, s14
-; GFX9-NEXT: s_add_i32 s4, s4, s11
-; GFX9-NEXT: s_mul_i32 s10, s10, s14
-; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10
-; GFX9-NEXT: s_mul_i32 s13, s12, s10
-; GFX9-NEXT: s_mul_i32 s16, s14, s4
-; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10
-; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4
-; GFX9-NEXT: s_add_u32 s10, s10, s16
+; GFX9-NEXT: s_add_u32 s11, s11, s12
+; GFX9-NEXT: s_addc_u32 s10, s10, s13
+; GFX9-NEXT: s_mul_i32 s12, s4, s10
+; GFX9-NEXT: s_mul_hi_u32 s13, s4, s11
+; GFX9-NEXT: s_add_i32 s12, s13, s12
+; GFX9-NEXT: s_mul_i32 s5, s5, s11
+; GFX9-NEXT: s_add_i32 s12, s12, s5
+; GFX9-NEXT: s_mul_i32 s4, s4, s11
+; GFX9-NEXT: s_mul_hi_u32 s13, s10, s4
+; GFX9-NEXT: s_mul_i32 s14, s10, s4
+; GFX9-NEXT: s_mul_i32 s16, s11, s12
+; GFX9-NEXT: s_mul_hi_u32 s4, s11, s4
+; GFX9-NEXT: s_mul_hi_u32 s15, s11, s12
+; GFX9-NEXT: s_add_u32 s4, s4, s16
; GFX9-NEXT: s_addc_u32 s15, 0, s15
-; GFX9-NEXT: s_add_u32 s10, s10, s13
-; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4
-; GFX9-NEXT: s_addc_u32 s10, s15, s11
+; GFX9-NEXT: s_add_u32 s4, s4, s14
+; GFX9-NEXT: s_mul_hi_u32 s5, s10, s12
+; GFX9-NEXT: s_addc_u32 s4, s15, s13
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s4, s12, s4
-; GFX9-NEXT: s_add_u32 s4, s10, s4
-; GFX9-NEXT: s_addc_u32 s10, 0, s5
-; GFX9-NEXT: s_add_u32 s11, s14, s4
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s10, s12, s10
+; GFX9-NEXT: s_mul_i32 s12, s10, s12
+; GFX9-NEXT: s_add_u32 s4, s4, s12
+; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_add_u32 s11, s11, s4
+; GFX9-NEXT: s_addc_u32 s10, s10, s5
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: s_add_u32 s2, s2, s4
@@ -8028,38 +8020,35 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_addc_u32 s11, s12, s15
; GFX9-NEXT: s_addc_u32 s12, s14, 0
; GFX9-NEXT: s_mul_i32 s10, s3, s10
-; GFX9-NEXT: s_add_u32 s14, s11, s10
-; GFX9-NEXT: s_addc_u32 s15, 0, s12
-; GFX9-NEXT: s_mul_i32 s10, s8, s15
-; GFX9-NEXT: s_mul_hi_u32 s11, s8, s14
+; GFX9-NEXT: s_add_u32 s13, s11, s10
+; GFX9-NEXT: s_addc_u32 s12, 0, s12
+; GFX9-NEXT: s_mul_i32 s10, s8, s12
+; GFX9-NEXT: s_mul_hi_u32 s11, s8, s13
; GFX9-NEXT: s_add_i32 s10, s11, s10
-; GFX9-NEXT: s_mul_i32 s11, s9, s14
-; GFX9-NEXT: s_add_i32 s16, s10, s11
-; GFX9-NEXT: s_sub_i32 s12, s3, s16
-; GFX9-NEXT: s_mul_i32 s10, s8, s14
+; GFX9-NEXT: s_mul_i32 s11, s9, s13
+; GFX9-NEXT: s_add_i32 s14, s10, s11
+; GFX9-NEXT: s_sub_i32 s15, s3, s14
+; GFX9-NEXT: s_mul_i32 s10, s8, s13
; GFX9-NEXT: s_sub_u32 s2, s2, s10
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s17, s12, s9
-; GFX9-NEXT: s_sub_u32 s18, s2, s8
-; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GFX9-NEXT: s_subb_u32 s12, s17, 0
-; GFX9-NEXT: s_cmp_ge_u32 s12, s9
-; GFX9-NEXT: s_cselect_b32 s13, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s18, s8
+; GFX9-NEXT: s_subb_u32 s15, s15, s9
+; GFX9-NEXT: s_sub_u32 s16, s2, s8
+; GFX9-NEXT: s_subb_u32 s15, s15, 0
+; GFX9-NEXT: s_cmp_ge_u32 s15, s9
; GFX9-NEXT: s_cselect_b32 s17, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s12, s9
-; GFX9-NEXT: s_cselect_b32 s12, s17, s13
-; GFX9-NEXT: s_add_u32 s13, s14, 1
-; GFX9-NEXT: s_addc_u32 s17, s15, 0
-; GFX9-NEXT: s_add_u32 s18, s14, 2
-; GFX9-NEXT: s_addc_u32 s19, s15, 0
-; GFX9-NEXT: s_cmp_lg_u32 s12, 0
-; GFX9-NEXT: s_cselect_b32 s12, s18, s13
-; GFX9-NEXT: s_cselect_b32 s13, s19, s17
+; GFX9-NEXT: s_cmp_ge_u32 s16, s8
+; GFX9-NEXT: s_cselect_b32 s16, -1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s15, s9
+; GFX9-NEXT: s_cselect_b32 s15, s16, s17
+; GFX9-NEXT: s_add_u32 s16, s13, 1
+; GFX9-NEXT: s_addc_u32 s17, s12, 0
+; GFX9-NEXT: s_add_u32 s18, s13, 2
+; GFX9-NEXT: s_addc_u32 s19, s12, 0
+; GFX9-NEXT: s_cmp_lg_u32 s15, 0
+; GFX9-NEXT: s_cselect_b32 s15, s18, s16
+; GFX9-NEXT: s_cselect_b32 s16, s19, s17
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s3, s3, s16
+; GFX9-NEXT: s_subb_u32 s3, s3, s14
; GFX9-NEXT: s_cmp_ge_u32 s3, s9
; GFX9-NEXT: s_cselect_b32 s10, -1, 0
; GFX9-NEXT: s_cmp_ge_u32 s2, s8
@@ -8067,8 +8056,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_cmp_eq_u32 s3, s9
; GFX9-NEXT: s_cselect_b32 s2, s2, s10
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
-; GFX9-NEXT: s_cselect_b32 s3, s13, s15
-; GFX9-NEXT: s_cselect_b32 s2, s12, s14
+; GFX9-NEXT: s_cselect_b32 s3, s16, s12
+; GFX9-NEXT: s_cselect_b32 s2, s15, s13
; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: s_sub_u32 s2, s2, s4
@@ -8328,10 +8317,9 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_addc_u32 s17, 0, s18
; GFX6-NEXT: s_add_u32 s18, s12, s13
; GFX6-NEXT: v_mov_b32_e32 v0, s18
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
; GFX6-NEXT: s_addc_u32 s16, s16, s17
; GFX6-NEXT: s_mul_i32 s12, s14, s16
; GFX6-NEXT: v_readfirstlane_b32 s13, v0
@@ -8362,7 +8350,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_add_u32 s15, s18, s12
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
; GFX6-NEXT: s_addc_u32 s14, s16, s14
; GFX6-NEXT: s_ashr_i32 s12, s9, 31
; GFX6-NEXT: s_add_u32 s8, s8, s12
@@ -8387,55 +8374,53 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_readfirstlane_b32 s16, v0
; GFX6-NEXT: s_addc_u32 s16, s16, 0
; GFX6-NEXT: s_mul_i32 s14, s9, s14
-; GFX6-NEXT: s_add_u32 s17, s15, s14
-; GFX6-NEXT: v_mov_b32_e32 v0, s17
+; GFX6-NEXT: s_add_u32 s18, s15, s14
+; GFX6-NEXT: v_mov_b32_e32 v0, s18
; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
-; GFX6-NEXT: s_addc_u32 s16, 0, s16
-; GFX6-NEXT: s_mul_i32 s14, s6, s16
+; GFX6-NEXT: s_addc_u32 s19, 0, s16
+; GFX6-NEXT: s_mul_i32 s14, s6, s19
; GFX6-NEXT: v_readfirstlane_b32 s15, v0
; GFX6-NEXT: s_add_i32 s14, s15, s14
-; GFX6-NEXT: s_mul_i32 s15, s7, s17
-; GFX6-NEXT: s_add_i32 s18, s14, s15
-; GFX6-NEXT: s_sub_i32 s19, s9, s18
-; GFX6-NEXT: s_mul_i32 s14, s6, s17
+; GFX6-NEXT: s_mul_i32 s15, s7, s18
+; GFX6-NEXT: s_add_i32 s20, s14, s15
+; GFX6-NEXT: s_sub_i32 s16, s9, s20
+; GFX6-NEXT: s_mul_i32 s14, s6, s18
; GFX6-NEXT: s_sub_u32 s8, s8, s14
; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT: s_or_b32 s20, s14, s15
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_subb_u32 s19, s19, s7
-; GFX6-NEXT: s_sub_u32 s21, s8, s6
-; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT: s_or_b32 s17, s14, s15
+; GFX6-NEXT: s_subb_u32 s21, s16, s7
+; GFX6-NEXT: s_sub_u32 s22, s8, s6
+; GFX6-NEXT: s_cselect_b64 s[16:17], -1, 0
+; GFX6-NEXT: s_or_b32 s16, s16, s17
+; GFX6-NEXT: s_subb_u32 s16, s21, 0
+; GFX6-NEXT: s_cmp_ge_u32 s16, s7
+; GFX6-NEXT: s_cselect_b32 s17, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s22, s6
+; GFX6-NEXT: s_cselect_b32 s21, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s16, s7
+; GFX6-NEXT: s_cselect_b32 s16, s21, s17
+; GFX6-NEXT: s_add_u32 s17, s18, 1
+; GFX6-NEXT: s_addc_u32 s21, s19, 0
+; GFX6-NEXT: s_add_u32 s22, s18, 2
+; GFX6-NEXT: s_addc_u32 s23, s19, 0
+; GFX6-NEXT: s_cmp_lg_u32 s16, 0
+; GFX6-NEXT: s_cselect_b32 s16, s22, s17
+; GFX6-NEXT: s_cselect_b32 s17, s23, s21
; GFX6-NEXT: s_or_b32 s14, s14, s15
-; GFX6-NEXT: s_cmp_lg_u32 s14, 0
-; GFX6-NEXT: s_subb_u32 s14, s19, 0
-; GFX6-NEXT: s_cmp_ge_u32 s14, s7
-; GFX6-NEXT: s_cselect_b32 s15, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s21, s6
-; GFX6-NEXT: s_cselect_b32 s19, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s14, s7
-; GFX6-NEXT: s_cselect_b32 s14, s19, s15
-; GFX6-NEXT: s_add_u32 s15, s17, 1
-; GFX6-NEXT: s_addc_u32 s19, s16, 0
-; GFX6-NEXT: s_add_u32 s21, s17, 2
-; GFX6-NEXT: s_addc_u32 s22, s16, 0
-; GFX6-NEXT: s_cmp_lg_u32 s14, 0
-; GFX6-NEXT: s_cselect_b32 s14, s21, s15
-; GFX6-NEXT: s_cselect_b32 s15, s22, s19
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_subb_u32 s9, s9, s18
+; GFX6-NEXT: s_subb_u32 s9, s9, s20
; GFX6-NEXT: s_cmp_ge_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s18, -1, 0
+; GFX6-NEXT: s_cselect_b32 s14, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s8, s6
; GFX6-NEXT: s_cselect_b32 s6, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s6, s6, s18
+; GFX6-NEXT: s_cselect_b32 s6, s6, s14
; GFX6-NEXT: s_cmp_lg_u32 s6, 0
-; GFX6-NEXT: s_cselect_b32 s7, s15, s16
-; GFX6-NEXT: s_cselect_b32 s6, s14, s17
+; GFX6-NEXT: s_cselect_b32 s7, s17, s19
+; GFX6-NEXT: s_cselect_b32 s6, s16, s18
; GFX6-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3]
; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
-; GFX6-NEXT: s_sub_u32 s14, s6, s2
-; GFX6-NEXT: s_subb_u32 s15, s7, s3
+; GFX6-NEXT: s_sub_u32 s16, s6, s2
+; GFX6-NEXT: s_subb_u32 s17, s7, s3
; GFX6-NEXT: s_ashr_i32 s6, s1, 31
; GFX6-NEXT: s_add_u32 s0, s0, s6
; GFX6-NEXT: s_mov_b32 s7, s6
@@ -8454,40 +8439,39 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0
-; GFX6-NEXT: v_readfirstlane_b32 s16, v1
+; GFX6-NEXT: v_readfirstlane_b32 s14, v1
; GFX6-NEXT: v_readfirstlane_b32 s2, v0
-; GFX6-NEXT: s_mul_i32 s1, s12, s16
+; GFX6-NEXT: s_mul_i32 s1, s12, s14
; GFX6-NEXT: v_readfirstlane_b32 s3, v2
; GFX6-NEXT: s_mul_i32 s0, s13, s2
; GFX6-NEXT: s_add_i32 s1, s3, s1
; GFX6-NEXT: s_add_i32 s3, s1, s0
-; GFX6-NEXT: s_mul_i32 s17, s12, s2
+; GFX6-NEXT: s_mul_i32 s15, s12, s2
; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3
-; GFX6-NEXT: v_mul_hi_u32 v0, v0, s17
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: s_mul_i32 s4, s2, s3
; GFX6-NEXT: v_readfirstlane_b32 s5, v2
; GFX6-NEXT: v_readfirstlane_b32 s18, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, v1, s17
+; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15
; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3
; GFX6-NEXT: s_add_u32 s4, s18, s4
; GFX6-NEXT: s_addc_u32 s5, 0, s5
-; GFX6-NEXT: s_mul_i32 s17, s16, s17
+; GFX6-NEXT: s_mul_i32 s15, s14, s15
; GFX6-NEXT: v_readfirstlane_b32 s18, v0
-; GFX6-NEXT: s_add_u32 s4, s4, s17
+; GFX6-NEXT: s_add_u32 s4, s4, s15
; GFX6-NEXT: s_addc_u32 s4, s5, s18
; GFX6-NEXT: v_readfirstlane_b32 s5, v1
; GFX6-NEXT: s_addc_u32 s5, s5, 0
-; GFX6-NEXT: s_mul_i32 s3, s16, s3
+; GFX6-NEXT: s_mul_i32 s3, s14, s3
; GFX6-NEXT: s_add_u32 s3, s4, s3
; GFX6-NEXT: s_addc_u32 s4, 0, s5
; GFX6-NEXT: s_add_u32 s5, s2, s3
; GFX6-NEXT: v_mov_b32_e32 v0, s5
-; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6-NEXT: s_addc_u32 s4, s16, s4
+; GFX6-NEXT: s_addc_u32 s4, s14, s4
; GFX6-NEXT: s_mul_i32 s2, s12, s4
; GFX6-NEXT: v_readfirstlane_b32 s3, v0
; GFX6-NEXT: s_add_i32 s2, s3, s2
@@ -8501,14 +8485,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX6-NEXT: s_mul_i32 s13, s5, s2
-; GFX6-NEXT: v_readfirstlane_b32 s17, v2
-; GFX6-NEXT: s_add_u32 s13, s17, s13
-; GFX6-NEXT: v_readfirstlane_b32 s16, v0
+; GFX6-NEXT: v_readfirstlane_b32 s15, v2
+; GFX6-NEXT: s_add_u32 s13, s15, s13
+; GFX6-NEXT: v_readfirstlane_b32 s14, v0
; GFX6-NEXT: s_mul_i32 s3, s4, s3
-; GFX6-NEXT: s_addc_u32 s16, 0, s16
+; GFX6-NEXT: s_addc_u32 s14, 0, s14
; GFX6-NEXT: v_readfirstlane_b32 s12, v3
; GFX6-NEXT: s_add_u32 s3, s13, s3
-; GFX6-NEXT: s_addc_u32 s3, s16, s12
+; GFX6-NEXT: s_addc_u32 s3, s14, s12
; GFX6-NEXT: v_readfirstlane_b32 s12, v1
; GFX6-NEXT: s_addc_u32 s12, s12, 0
; GFX6-NEXT: s_mul_i32 s2, s4, s2
@@ -8517,7 +8501,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_add_u32 s13, s5, s2
; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
; GFX6-NEXT: s_addc_u32 s12, s4, s12
; GFX6-NEXT: s_ashr_i32 s4, s11, 31
; GFX6-NEXT: s_add_u32 s2, s10, s4
@@ -8529,72 +8512,70 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mov_b32_e32 v2, s13
; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2
; GFX6-NEXT: s_mul_i32 s2, s10, s12
-; GFX6-NEXT: v_readfirstlane_b32 s16, v1
+; GFX6-NEXT: v_readfirstlane_b32 s14, v1
; GFX6-NEXT: v_mul_hi_u32 v1, s11, v2
-; GFX6-NEXT: v_readfirstlane_b32 s17, v3
+; GFX6-NEXT: v_readfirstlane_b32 s15, v3
; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0
-; GFX6-NEXT: s_add_u32 s2, s17, s2
-; GFX6-NEXT: s_addc_u32 s16, 0, s16
+; GFX6-NEXT: s_add_u32 s2, s15, s2
+; GFX6-NEXT: s_addc_u32 s14, 0, s14
; GFX6-NEXT: s_mul_i32 s13, s11, s13
-; GFX6-NEXT: v_readfirstlane_b32 s17, v1
+; GFX6-NEXT: v_readfirstlane_b32 s15, v1
; GFX6-NEXT: s_add_u32 s2, s2, s13
-; GFX6-NEXT: s_addc_u32 s2, s16, s17
+; GFX6-NEXT: s_addc_u32 s2, s14, s15
; GFX6-NEXT: v_readfirstlane_b32 s13, v0
; GFX6-NEXT: s_addc_u32 s13, s13, 0
; GFX6-NEXT: s_mul_i32 s12, s11, s12
-; GFX6-NEXT: s_add_u32 s16, s2, s12
-; GFX6-NEXT: v_mov_b32_e32 v0, s16
+; GFX6-NEXT: s_add_u32 s18, s2, s12
+; GFX6-NEXT: v_mov_b32_e32 v0, s18
; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
-; GFX6-NEXT: s_addc_u32 s17, 0, s13
-; GFX6-NEXT: s_mul_i32 s12, s8, s17
+; GFX6-NEXT: s_addc_u32 s19, 0, s13
+; GFX6-NEXT: s_mul_i32 s12, s8, s19
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: v_readfirstlane_b32 s13, v0
; GFX6-NEXT: s_add_i32 s12, s13, s12
-; GFX6-NEXT: s_mul_i32 s13, s9, s16
-; GFX6-NEXT: s_add_i32 s18, s12, s13
-; GFX6-NEXT: s_sub_i32 s19, s11, s18
-; GFX6-NEXT: s_mul_i32 s12, s8, s16
+; GFX6-NEXT: s_mul_i32 s13, s9, s18
+; GFX6-NEXT: s_add_i32 s20, s12, s13
+; GFX6-NEXT: s_sub_i32 s14, s11, s20
+; GFX6-NEXT: s_mul_i32 s12, s8, s18
; GFX6-NEXT: s_sub_u32 s10, s10, s12
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s20, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_subb_u32 s19, s19, s9
-; GFX6-NEXT: s_sub_u32 s21, s10, s8
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s15, s12, s13
+; GFX6-NEXT: s_subb_u32 s21, s14, s9
+; GFX6-NEXT: s_sub_u32 s22, s10, s8
+; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT: s_or_b32 s14, s14, s15
+; GFX6-NEXT: s_subb_u32 s14, s21, 0
+; GFX6-NEXT: s_cmp_ge_u32 s14, s9
+; GFX6-NEXT: s_cselect_b32 s15, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s22, s8
+; GFX6-NEXT: s_cselect_b32 s21, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s14, s9
+; GFX6-NEXT: s_cselect_b32 s14, s21, s15
+; GFX6-NEXT: s_add_u32 s15, s18, 1
+; GFX6-NEXT: s_addc_u32 s21, s19, 0
+; GFX6-NEXT: s_add_u32 s22, s18, 2
+; GFX6-NEXT: s_addc_u32 s23, s19, 0
+; GFX6-NEXT: s_cmp_lg_u32 s14, 0
+; GFX6-NEXT: s_cselect_b32 s14, s22, s15
+; GFX6-NEXT: s_cselect_b32 s15, s23, s21
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s12, s19, 0
-; GFX6-NEXT: s_cmp_ge_u32 s12, s9
-; GFX6-NEXT: s_cselect_b32 s13, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s21, s8
-; GFX6-NEXT: s_cselect_b32 s19, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s12, s9
-; GFX6-NEXT: s_cselect_b32 s12, s19, s13
-; GFX6-NEXT: s_add_u32 s13, s16, 1
-; GFX6-NEXT: s_addc_u32 s19, s17, 0
-; GFX6-NEXT: s_add_u32 s21, s16, 2
-; GFX6-NEXT: s_addc_u32 s22, s17, 0
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_cselect_b32 s12, s21, s13
-; GFX6-NEXT: s_cselect_b32 s13, s22, s19
-; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_subb_u32 s11, s11, s18
+; GFX6-NEXT: s_subb_u32 s11, s11, s20
; GFX6-NEXT: s_cmp_ge_u32 s11, s9
-; GFX6-NEXT: s_cselect_b32 s18, -1, 0
+; GFX6-NEXT: s_cselect_b32 s12, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s10, s8
; GFX6-NEXT: s_cselect_b32 s8, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s11, s9
-; GFX6-NEXT: s_cselect_b32 s8, s8, s18
+; GFX6-NEXT: s_cselect_b32 s8, s8, s12
; GFX6-NEXT: s_cmp_lg_u32 s8, 0
-; GFX6-NEXT: s_cselect_b32 s9, s13, s17
-; GFX6-NEXT: s_cselect_b32 s8, s12, s16
+; GFX6-NEXT: s_cselect_b32 s9, s15, s19
+; GFX6-NEXT: s_cselect_b32 s8, s14, s18
; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5]
; GFX6-NEXT: s_sub_u32 s4, s6, s4
; GFX6-NEXT: s_subb_u32 s5, s7, s5
; GFX6-NEXT: s_mov_b32 s2, -1
-; GFX6-NEXT: v_mov_b32_e32 v0, s14
-; GFX6-NEXT: v_mov_b32_e32 v1, s15
+; GFX6-NEXT: v_mov_b32_e32 v0, s16
+; GFX6-NEXT: v_mov_b32_e32 v1, s17
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: v_mov_b32_e32 v3, s5
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -8614,8 +8595,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT: s_sub_u32 s14, 0, s6
-; GFX9-NEXT: s_subb_u32 s15, 0, s7
+; GFX9-NEXT: s_sub_u32 s12, 0, s6
+; GFX9-NEXT: s_subb_u32 s13, 0, s7
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -8624,56 +8605,52 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s16, v1
-; GFX9-NEXT: v_readfirstlane_b32 s12, v0
-; GFX9-NEXT: s_mul_i32 s13, s14, s16
-; GFX9-NEXT: s_mul_hi_u32 s18, s14, s12
-; GFX9-NEXT: s_mul_i32 s17, s15, s12
-; GFX9-NEXT: s_add_i32 s13, s18, s13
-; GFX9-NEXT: s_mul_i32 s19, s14, s12
-; GFX9-NEXT: s_add_i32 s13, s13, s17
-; GFX9-NEXT: s_mul_hi_u32 s18, s12, s19
-; GFX9-NEXT: s_mul_i32 s20, s12, s13
-; GFX9-NEXT: s_mul_hi_u32 s17, s12, s13
+; GFX9-NEXT: v_readfirstlane_b32 s14, v1
+; GFX9-NEXT: v_readfirstlane_b32 s15, v0
+; GFX9-NEXT: s_mul_i32 s16, s12, s14
+; GFX9-NEXT: s_mul_hi_u32 s18, s12, s15
+; GFX9-NEXT: s_mul_i32 s17, s13, s15
+; GFX9-NEXT: s_add_i32 s16, s18, s16
+; GFX9-NEXT: s_mul_i32 s19, s12, s15
+; GFX9-NEXT: s_add_i32 s16, s16, s17
+; GFX9-NEXT: s_mul_hi_u32 s18, s15, s19
+; GFX9-NEXT: s_mul_i32 s20, s15, s16
+; GFX9-NEXT: s_mul_hi_u32 s17, s15, s16
; GFX9-NEXT: s_add_u32 s18, s18, s20
; GFX9-NEXT: s_addc_u32 s17, 0, s17
-; GFX9-NEXT: s_mul_hi_u32 s20, s16, s19
-; GFX9-NEXT: s_mul_i32 s19, s16, s19
+; GFX9-NEXT: s_mul_hi_u32 s20, s14, s19
+; GFX9-NEXT: s_mul_i32 s19, s14, s19
; GFX9-NEXT: s_add_u32 s18, s18, s19
-; GFX9-NEXT: s_mul_hi_u32 s21, s16, s13
+; GFX9-NEXT: s_mul_hi_u32 s21, s14, s16
; GFX9-NEXT: s_addc_u32 s17, s17, s20
; GFX9-NEXT: s_addc_u32 s18, s21, 0
-; GFX9-NEXT: s_mul_i32 s13, s16, s13
-; GFX9-NEXT: s_add_u32 s13, s17, s13
+; GFX9-NEXT: s_mul_i32 s16, s14, s16
+; GFX9-NEXT: s_add_u32 s16, s17, s16
; GFX9-NEXT: s_addc_u32 s17, 0, s18
-; GFX9-NEXT: s_add_u32 s18, s12, s13
-; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GFX9-NEXT: s_addc_u32 s16, s16, s17
-; GFX9-NEXT: s_mul_i32 s12, s14, s16
-; GFX9-NEXT: s_mul_hi_u32 s13, s14, s18
-; GFX9-NEXT: s_add_i32 s12, s13, s12
-; GFX9-NEXT: s_mul_i32 s15, s15, s18
-; GFX9-NEXT: s_add_i32 s12, s12, s15
-; GFX9-NEXT: s_mul_i32 s14, s14, s18
-; GFX9-NEXT: s_mul_hi_u32 s15, s16, s14
-; GFX9-NEXT: s_mul_i32 s17, s16, s14
-; GFX9-NEXT: s_mul_i32 s20, s18, s12
-; GFX9-NEXT: s_mul_hi_u32 s14, s18, s14
-; GFX9-NEXT: s_mul_hi_u32 s19, s18, s12
-; GFX9-NEXT: s_add_u32 s14, s14, s20
+; GFX9-NEXT: s_add_u32 s15, s15, s16
+; GFX9-NEXT: s_addc_u32 s14, s14, s17
+; GFX9-NEXT: s_mul_i32 s16, s12, s14
+; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15
+; GFX9-NEXT: s_add_i32 s16, s17, s16
+; GFX9-NEXT: s_mul_i32 s13, s13, s15
+; GFX9-NEXT: s_add_i32 s16, s16, s13
+; GFX9-NEXT: s_mul_i32 s12, s12, s15
+; GFX9-NEXT: s_mul_hi_u32 s17, s14, s12
+; GFX9-NEXT: s_mul_i32 s18, s14, s12
+; GFX9-NEXT: s_mul_i32 s20, s15, s16
+; GFX9-NEXT: s_mul_hi_u32 s12, s15, s12
+; GFX9-NEXT: s_mul_hi_u32 s19, s15, s16
+; GFX9-NEXT: s_add_u32 s12, s12, s20
; GFX9-NEXT: s_addc_u32 s19, 0, s19
-; GFX9-NEXT: s_add_u32 s14, s14, s17
-; GFX9-NEXT: s_mul_hi_u32 s13, s16, s12
-; GFX9-NEXT: s_addc_u32 s14, s19, s15
+; GFX9-NEXT: s_add_u32 s12, s12, s18
+; GFX9-NEXT: s_mul_hi_u32 s13, s14, s16
+; GFX9-NEXT: s_addc_u32 s12, s19, s17
; GFX9-NEXT: s_addc_u32 s13, s13, 0
-; GFX9-NEXT: s_mul_i32 s12, s16, s12
-; GFX9-NEXT: s_add_u32 s12, s14, s12
-; GFX9-NEXT: s_addc_u32 s14, 0, s13
-; GFX9-NEXT: s_add_u32 s15, s18, s12
-; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GFX9-NEXT: s_addc_u32 s14, s16, s14
+; GFX9-NEXT: s_mul_i32 s16, s14, s16
+; GFX9-NEXT: s_add_u32 s12, s12, s16
+; GFX9-NEXT: s_addc_u32 s13, 0, s13
+; GFX9-NEXT: s_add_u32 s15, s15, s12
+; GFX9-NEXT: s_addc_u32 s14, s14, s13
; GFX9-NEXT: s_ashr_i32 s12, s9, 31
; GFX9-NEXT: s_add_u32 s8, s8, s12
; GFX9-NEXT: s_mov_b32 s13, s12
@@ -8691,38 +8668,35 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_addc_u32 s15, s16, s19
; GFX9-NEXT: s_addc_u32 s16, s18, 0
; GFX9-NEXT: s_mul_i32 s14, s9, s14
-; GFX9-NEXT: s_add_u32 s18, s15, s14
-; GFX9-NEXT: s_addc_u32 s19, 0, s16
-; GFX9-NEXT: s_mul_i32 s14, s6, s19
-; GFX9-NEXT: s_mul_hi_u32 s15, s6, s18
+; GFX9-NEXT: s_add_u32 s17, s15, s14
+; GFX9-NEXT: s_addc_u32 s16, 0, s16
+; GFX9-NEXT: s_mul_i32 s14, s6, s16
+; GFX9-NEXT: s_mul_hi_u32 s15, s6, s17
; GFX9-NEXT: s_add_i32 s14, s15, s14
-; GFX9-NEXT: s_mul_i32 s15, s7, s18
-; GFX9-NEXT: s_add_i32 s20, s14, s15
-; GFX9-NEXT: s_sub_i32 s16, s9, s20
-; GFX9-NEXT: s_mul_i32 s14, s6, s18
+; GFX9-NEXT: s_mul_i32 s15, s7, s17
+; GFX9-NEXT: s_add_i32 s18, s14, s15
+; GFX9-NEXT: s_sub_i32 s19, s9, s18
+; GFX9-NEXT: s_mul_i32 s14, s6, s17
; GFX9-NEXT: s_sub_u32 s8, s8, s14
; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT: s_subb_u32 s21, s16, s7
-; GFX9-NEXT: s_sub_u32 s22, s8, s6
-; GFX9-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[16:17], 0
-; GFX9-NEXT: s_subb_u32 s16, s21, 0
-; GFX9-NEXT: s_cmp_ge_u32 s16, s7
-; GFX9-NEXT: s_cselect_b32 s17, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s22, s6
+; GFX9-NEXT: s_subb_u32 s19, s19, s7
+; GFX9-NEXT: s_sub_u32 s20, s8, s6
+; GFX9-NEXT: s_subb_u32 s19, s19, 0
+; GFX9-NEXT: s_cmp_ge_u32 s19, s7
; GFX9-NEXT: s_cselect_b32 s21, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s16, s7
-; GFX9-NEXT: s_cselect_b32 s16, s21, s17
-; GFX9-NEXT: s_add_u32 s17, s18, 1
-; GFX9-NEXT: s_addc_u32 s21, s19, 0
-; GFX9-NEXT: s_add_u32 s22, s18, 2
-; GFX9-NEXT: s_addc_u32 s23, s19, 0
-; GFX9-NEXT: s_cmp_lg_u32 s16, 0
-; GFX9-NEXT: s_cselect_b32 s16, s22, s17
-; GFX9-NEXT: s_cselect_b32 s17, s23, s21
+; GFX9-NEXT: s_cmp_ge_u32 s20, s6
+; GFX9-NEXT: s_cselect_b32 s20, -1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s19, s7
+; GFX9-NEXT: s_cselect_b32 s19, s20, s21
+; GFX9-NEXT: s_add_u32 s20, s17, 1
+; GFX9-NEXT: s_addc_u32 s21, s16, 0
+; GFX9-NEXT: s_add_u32 s22, s17, 2
+; GFX9-NEXT: s_addc_u32 s23, s16, 0
+; GFX9-NEXT: s_cmp_lg_u32 s19, 0
+; GFX9-NEXT: s_cselect_b32 s19, s22, s20
+; GFX9-NEXT: s_cselect_b32 s20, s23, s21
; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT: s_subb_u32 s9, s9, s20
+; GFX9-NEXT: s_subb_u32 s9, s9, s18
; GFX9-NEXT: s_cmp_ge_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s14, -1, 0
; GFX9-NEXT: s_cmp_ge_u32 s8, s6
@@ -8730,12 +8704,12 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s6, s6, s14
; GFX9-NEXT: s_cmp_lg_u32 s6, 0
-; GFX9-NEXT: s_cselect_b32 s7, s17, s19
-; GFX9-NEXT: s_cselect_b32 s6, s16, s18
+; GFX9-NEXT: s_cselect_b32 s7, s20, s16
+; GFX9-NEXT: s_cselect_b32 s6, s19, s17
; GFX9-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3]
; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
-; GFX9-NEXT: s_sub_u32 s14, s6, s2
-; GFX9-NEXT: s_subb_u32 s15, s7, s3
+; GFX9-NEXT: s_sub_u32 s12, s6, s2
+; GFX9-NEXT: s_subb_u32 s13, s7, s3
; GFX9-NEXT: s_ashr_i32 s2, s1, 31
; GFX9-NEXT: s_add_u32 s0, s0, s2
; GFX9-NEXT: s_mov_b32 s3, s2
@@ -8744,8 +8718,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s8, 0, s6
-; GFX9-NEXT: s_subb_u32 s9, 0, s7
+; GFX9-NEXT: s_sub_u32 s4, 0, s6
+; GFX9-NEXT: s_subb_u32 s5, 0, s7
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -8755,105 +8729,98 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: v_readfirstlane_b32 s13, v2
-; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4
-; GFX9-NEXT: s_mul_i32 s16, s8, s13
-; GFX9-NEXT: s_mul_i32 s5, s9, s4
-; GFX9-NEXT: s_add_i32 s12, s12, s16
-; GFX9-NEXT: s_add_i32 s12, s12, s5
-; GFX9-NEXT: s_mul_i32 s17, s8, s4
-; GFX9-NEXT: s_mul_i32 s16, s4, s12
-; GFX9-NEXT: s_mul_hi_u32 s18, s4, s17
-; GFX9-NEXT: s_mul_hi_u32 s5, s4, s12
+; GFX9-NEXT: v_readfirstlane_b32 s8, v1
+; GFX9-NEXT: v_readfirstlane_b32 s15, v2
+; GFX9-NEXT: s_mul_hi_u32 s14, s4, s8
+; GFX9-NEXT: s_mul_i32 s16, s4, s15
+; GFX9-NEXT: s_mul_i32 s9, s5, s8
+; GFX9-NEXT: s_add_i32 s14, s14, s16
+; GFX9-NEXT: s_add_i32 s14, s14, s9
+; GFX9-NEXT: s_mul_i32 s17, s4, s8
+; GFX9-NEXT: s_mul_i32 s16, s8, s14
+; GFX9-NEXT: s_mul_hi_u32 s18, s8, s17
+; GFX9-NEXT: s_mul_hi_u32 s9, s8, s14
; GFX9-NEXT: s_add_u32 s16, s18, s16
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_mul_hi_u32 s19, s13, s17
-; GFX9-NEXT: s_mul_i32 s17, s13, s17
+; GFX9-NEXT: s_addc_u32 s9, 0, s9
+; GFX9-NEXT: s_mul_hi_u32 s19, s15, s17
+; GFX9-NEXT: s_mul_i32 s17, s15, s17
; GFX9-NEXT: s_add_u32 s16, s16, s17
-; GFX9-NEXT: s_mul_hi_u32 s18, s13, s12
-; GFX9-NEXT: s_addc_u32 s5, s5, s19
+; GFX9-NEXT: s_mul_hi_u32 s18, s15, s14
+; GFX9-NEXT: s_addc_u32 s9, s9, s19
; GFX9-NEXT: s_addc_u32 s16, s18, 0
-; GFX9-NEXT: s_mul_i32 s12, s13, s12
-; GFX9-NEXT: s_add_u32 s5, s5, s12
-; GFX9-NEXT: s_addc_u32 s12, 0, s16
-; GFX9-NEXT: s_add_u32 s16, s4, s5
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s12, s13, s12
-; GFX9-NEXT: s_mul_i32 s4, s8, s12
-; GFX9-NEXT: s_mul_hi_u32 s5, s8, s16
-; GFX9-NEXT: s_add_i32 s4, s5, s4
-; GFX9-NEXT: s_mul_i32 s9, s9, s16
-; GFX9-NEXT: s_add_i32 s4, s4, s9
-; GFX9-NEXT: s_mul_i32 s8, s8, s16
-; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8
-; GFX9-NEXT: s_mul_i32 s13, s12, s8
-; GFX9-NEXT: s_mul_i32 s18, s16, s4
-; GFX9-NEXT: s_mul_hi_u32 s8, s16, s8
-; GFX9-NEXT: s_mul_hi_u32 s17, s16, s4
-; GFX9-NEXT: s_add_u32 s8, s8, s18
+; GFX9-NEXT: s_mul_i32 s14, s15, s14
+; GFX9-NEXT: s_add_u32 s9, s9, s14
+; GFX9-NEXT: s_addc_u32 s14, 0, s16
+; GFX9-NEXT: s_add_u32 s8, s8, s9
+; GFX9-NEXT: s_addc_u32 s9, s15, s14
+; GFX9-NEXT: s_mul_i32 s14, s4, s9
+; GFX9-NEXT: s_mul_hi_u32 s15, s4, s8
+; GFX9-NEXT: s_add_i32 s14, s15, s14
+; GFX9-NEXT: s_mul_i32 s5, s5, s8
+; GFX9-NEXT: s_add_i32 s14, s14, s5
+; GFX9-NEXT: s_mul_i32 s4, s4, s8
+; GFX9-NEXT: s_mul_hi_u32 s15, s9, s4
+; GFX9-NEXT: s_mul_i32 s16, s9, s4
+; GFX9-NEXT: s_mul_i32 s18, s8, s14
+; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4
+; GFX9-NEXT: s_mul_hi_u32 s17, s8, s14
+; GFX9-NEXT: s_add_u32 s4, s4, s18
; GFX9-NEXT: s_addc_u32 s17, 0, s17
-; GFX9-NEXT: s_add_u32 s8, s8, s13
-; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4
-; GFX9-NEXT: s_addc_u32 s8, s17, s9
+; GFX9-NEXT: s_add_u32 s4, s4, s16
+; GFX9-NEXT: s_mul_hi_u32 s5, s9, s14
+; GFX9-NEXT: s_addc_u32 s4, s17, s15
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s4, s12, s4
-; GFX9-NEXT: s_add_u32 s4, s8, s4
-; GFX9-NEXT: s_addc_u32 s8, 0, s5
-; GFX9-NEXT: s_add_u32 s13, s16, s4
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s12, s12, s8
+; GFX9-NEXT: s_mul_i32 s14, s9, s14
+; GFX9-NEXT: s_add_u32 s4, s4, s14
+; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_add_u32 s14, s8, s4
+; GFX9-NEXT: s_addc_u32 s15, s9, s5
; GFX9-NEXT: s_ashr_i32 s4, s11, 31
; GFX9-NEXT: s_add_u32 s8, s10, s4
; GFX9-NEXT: s_mov_b32 s5, s4
; GFX9-NEXT: s_addc_u32 s9, s11, s4
; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5]
-; GFX9-NEXT: s_mul_i32 s11, s8, s12
-; GFX9-NEXT: s_mul_hi_u32 s16, s8, s13
-; GFX9-NEXT: s_mul_hi_u32 s10, s8, s12
+; GFX9-NEXT: s_mul_i32 s11, s8, s15
+; GFX9-NEXT: s_mul_hi_u32 s16, s8, s14
+; GFX9-NEXT: s_mul_hi_u32 s10, s8, s15
; GFX9-NEXT: s_add_u32 s11, s16, s11
; GFX9-NEXT: s_addc_u32 s10, 0, s10
-; GFX9-NEXT: s_mul_hi_u32 s17, s9, s13
-; GFX9-NEXT: s_mul_i32 s13, s9, s13
-; GFX9-NEXT: s_add_u32 s11, s11, s13
-; GFX9-NEXT: s_mul_hi_u32 s16, s9, s12
+; GFX9-NEXT: s_mul_hi_u32 s17, s9, s14
+; GFX9-NEXT: s_mul_i32 s14, s9, s14
+; GFX9-NEXT: s_add_u32 s11, s11, s14
+; GFX9-NEXT: s_mul_hi_u32 s16, s9, s15
; GFX9-NEXT: s_addc_u32 s10, s10, s17
; GFX9-NEXT: s_addc_u32 s11, s16, 0
-; GFX9-NEXT: s_mul_i32 s12, s9, s12
-; GFX9-NEXT: s_add_u32 s16, s10, s12
-; GFX9-NEXT: s_addc_u32 s17, 0, s11
-; GFX9-NEXT: s_mul_i32 s10, s6, s17
-; GFX9-NEXT: s_mul_hi_u32 s11, s6, s16
+; GFX9-NEXT: s_mul_i32 s14, s9, s15
+; GFX9-NEXT: s_add_u32 s14, s10, s14
+; GFX9-NEXT: s_addc_u32 s15, 0, s11
+; GFX9-NEXT: s_mul_i32 s10, s6, s15
+; GFX9-NEXT: s_mul_hi_u32 s11, s6, s14
; GFX9-NEXT: s_add_i32 s10, s11, s10
-; GFX9-NEXT: s_mul_i32 s11, s7, s16
-; GFX9-NEXT: s_add_i32 s18, s10, s11
-; GFX9-NEXT: s_sub_i32 s12, s9, s18
-; GFX9-NEXT: s_mul_i32 s10, s6, s16
+; GFX9-NEXT: s_mul_i32 s11, s7, s14
+; GFX9-NEXT: s_add_i32 s16, s10, s11
+; GFX9-NEXT: s_sub_i32 s17, s9, s16
+; GFX9-NEXT: s_mul_i32 s10, s6, s14
; GFX9-NEXT: s_sub_u32 s8, s8, s10
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s19, s12, s7
-; GFX9-NEXT: s_sub_u32 s20, s8, s6
-; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GFX9-NEXT: s_subb_u32 s12, s19, 0
-; GFX9-NEXT: s_cmp_ge_u32 s12, s7
-; GFX9-NEXT: s_cselect_b32 s13, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s20, s6
+; GFX9-NEXT: s_subb_u32 s17, s17, s7
+; GFX9-NEXT: s_sub_u32 s18, s8, s6
+; GFX9-NEXT: s_subb_u32 s17, s17, 0
+; GFX9-NEXT: s_cmp_ge_u32 s17, s7
; GFX9-NEXT: s_cselect_b32 s19, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s12, s7
-; GFX9-NEXT: s_cselect_b32 s12, s19, s13
-; GFX9-NEXT: s_add_u32 s13, s16, 1
-; GFX9-NEXT: s_addc_u32 s19, s17, 0
-; GFX9-NEXT: s_add_u32 s20, s16, 2
-; GFX9-NEXT: s_addc_u32 s21, s17, 0
-; GFX9-NEXT: s_cmp_lg_u32 s12, 0
-; GFX9-NEXT: s_cselect_b32 s12, s20, s13
-; GFX9-NEXT: s_cselect_b32 s13, s21, s19
+; GFX9-NEXT: s_cmp_ge_u32 s18, s6
+; GFX9-NEXT: s_cselect_b32 s18, -1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s17, s7
+; GFX9-NEXT: s_cselect_b32 s17, s18, s19
+; GFX9-NEXT: s_add_u32 s18, s14, 1
+; GFX9-NEXT: s_addc_u32 s19, s15, 0
+; GFX9-NEXT: s_add_u32 s20, s14, 2
+; GFX9-NEXT: s_addc_u32 s21, s15, 0
+; GFX9-NEXT: s_cmp_lg_u32 s17, 0
+; GFX9-NEXT: s_cselect_b32 s17, s20, s18
+; GFX9-NEXT: s_cselect_b32 s18, s21, s19
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s9, s9, s18
+; GFX9-NEXT: s_subb_u32 s9, s9, s16
; GFX9-NEXT: s_cmp_ge_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s10, -1, 0
; GFX9-NEXT: s_cmp_ge_u32 s8, s6
@@ -8861,14 +8828,14 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s9, s7
; GFX9-NEXT: s_cselect_b32 s6, s6, s10
; GFX9-NEXT: s_cmp_lg_u32 s6, 0
-; GFX9-NEXT: s_cselect_b32 s7, s13, s17
-; GFX9-NEXT: s_cselect_b32 s6, s12, s16
+; GFX9-NEXT: s_cselect_b32 s7, s18, s15
+; GFX9-NEXT: s_cselect_b32 s6, s17, s14
; GFX9-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], s[2:3]
; GFX9-NEXT: s_sub_u32 s2, s4, s2
; GFX9-NEXT: s_subb_u32 s3, s5, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s14
-; GFX9-NEXT: v_mov_b32_e32 v2, s15
+; GFX9-NEXT: v_mov_b32_e32 v1, s12
+; GFX9-NEXT: v_mov_b32_e32 v2, s13
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -9089,10 +9056,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_addc_u32 s13, 0, s14
; GFX6-NEXT: s_add_u32 s14, s0, s1
; GFX6-NEXT: v_mov_b32_e32 v0, s14
-; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0
+; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s12, s12, s13
; GFX6-NEXT: s_mul_i32 s0, s10, s12
; GFX6-NEXT: v_readfirstlane_b32 s1, v0
@@ -9123,7 +9089,6 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_add_u32 s13, s14, s0
; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_cmp_lg_u32 s0, 0
; GFX6-NEXT: s_addc_u32 s12, s12, s10
; GFX6-NEXT: s_ashr_i32 s10, s7, 31
; GFX6-NEXT: s_add_u32 s0, s6, s10
@@ -9158,46 +9123,43 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: v_readfirstlane_b32 s5, v0
; GFX6-NEXT: s_add_i32 s4, s5, s4
; GFX6-NEXT: s_mul_i32 s5, s9, s12
-; GFX6-NEXT: s_add_i32 s13, s4, s5
-; GFX6-NEXT: s_sub_i32 s14, s7, s13
+; GFX6-NEXT: s_add_i32 s14, s4, s5
+; GFX6-NEXT: s_sub_i32 s13, s7, s14
; GFX6-NEXT: s_mul_i32 s4, s8, s12
; GFX6-NEXT: s_sub_u32 s6, s6, s4
; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX6-NEXT: s_or_b32 s12, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s14, s14, s9
-; GFX6-NEXT: s_sub_u32 s15, s6, s8
-; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX6-NEXT: s_subb_u32 s15, s13, s9
+; GFX6-NEXT: s_sub_u32 s16, s6, s8
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s17, s12, s13
+; GFX6-NEXT: s_subb_u32 s17, s15, 0
+; GFX6-NEXT: s_cmp_ge_u32 s17, s9
+; GFX6-NEXT: s_cselect_b32 s18, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s16, s8
+; GFX6-NEXT: s_cselect_b32 s19, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s17, s9
+; GFX6-NEXT: s_cselect_b32 s18, s19, s18
+; GFX6-NEXT: s_or_b32 s12, s12, s13
+; GFX6-NEXT: s_subb_u32 s15, s15, s9
+; GFX6-NEXT: s_sub_u32 s19, s16, s8
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s12, s12, s13
+; GFX6-NEXT: s_subb_u32 s12, s15, 0
+; GFX6-NEXT: s_cmp_lg_u32 s18, 0
+; GFX6-NEXT: s_cselect_b32 s13, s19, s16
+; GFX6-NEXT: s_cselect_b32 s12, s12, s17
; GFX6-NEXT: s_or_b32 s4, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_subb_u32 s16, s14, 0
-; GFX6-NEXT: s_cmp_ge_u32 s16, s9
+; GFX6-NEXT: s_subb_u32 s4, s7, s14
+; GFX6-NEXT: s_cmp_ge_u32 s4, s9
; GFX6-NEXT: s_cselect_b32 s5, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s15, s8
-; GFX6-NEXT: s_cselect_b32 s17, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s16, s9
-; GFX6-NEXT: s_cselect_b32 s17, s17, s5
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_subb_u32 s14, s14, s9
-; GFX6-NEXT: s_sub_u32 s18, s15, s8
-; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX6-NEXT: s_or_b32 s4, s4, s5
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_subb_u32 s4, s14, 0
-; GFX6-NEXT: s_cmp_lg_u32 s17, 0
-; GFX6-NEXT: s_cselect_b32 s14, s18, s15
-; GFX6-NEXT: s_cselect_b32 s4, s4, s16
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s5, s7, s13
-; GFX6-NEXT: s_cmp_ge_u32 s5, s9
-; GFX6-NEXT: s_cselect_b32 s7, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s6, s8
-; GFX6-NEXT: s_cselect_b32 s8, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s5, s9
-; GFX6-NEXT: s_cselect_b32 s7, s8, s7
-; GFX6-NEXT: s_cmp_lg_u32 s7, 0
-; GFX6-NEXT: s_cselect_b32 s5, s4, s5
-; GFX6-NEXT: s_cselect_b32 s4, s14, s6
+; GFX6-NEXT: s_cselect_b32 s7, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s4, s9
+; GFX6-NEXT: s_cselect_b32 s5, s7, s5
+; GFX6-NEXT: s_cmp_lg_u32 s5, 0
+; GFX6-NEXT: s_cselect_b32 s5, s12, s4
+; GFX6-NEXT: s_cselect_b32 s4, s13, s6
; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11]
; GFX6-NEXT: s_sub_u32 s4, s4, s10
; GFX6-NEXT: s_subb_u32 s5, s5, s10
@@ -9219,8 +9181,8 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s8, 0, s6
-; GFX9-NEXT: s_subb_u32 s9, 0, s7
+; GFX9-NEXT: s_sub_u32 s4, 0, s6
+; GFX9-NEXT: s_subb_u32 s5, 0, s7
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -9230,56 +9192,52 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s10, v2
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_mul_i32 s5, s8, s10
-; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4
-; GFX9-NEXT: s_mul_i32 s11, s9, s4
-; GFX9-NEXT: s_add_i32 s5, s12, s5
-; GFX9-NEXT: s_mul_i32 s13, s8, s4
-; GFX9-NEXT: s_add_i32 s5, s5, s11
-; GFX9-NEXT: s_mul_hi_u32 s12, s4, s13
-; GFX9-NEXT: s_mul_i32 s14, s4, s5
-; GFX9-NEXT: s_mul_hi_u32 s11, s4, s5
+; GFX9-NEXT: v_readfirstlane_b32 s8, v2
+; GFX9-NEXT: v_readfirstlane_b32 s9, v1
+; GFX9-NEXT: s_mul_i32 s10, s4, s8
+; GFX9-NEXT: s_mul_hi_u32 s12, s4, s9
+; GFX9-NEXT: s_mul_i32 s11, s5, s9
+; GFX9-NEXT: s_add_i32 s10, s12, s10
+; GFX9-NEXT: s_mul_i32 s13, s4, s9
+; GFX9-NEXT: s_add_i32 s10, s10, s11
+; GFX9-NEXT: s_mul_hi_u32 s12, s9, s13
+; GFX9-NEXT: s_mul_i32 s14, s9, s10
+; GFX9-NEXT: s_mul_hi_u32 s11, s9, s10
; GFX9-NEXT: s_add_u32 s12, s12, s14
; GFX9-NEXT: s_addc_u32 s11, 0, s11
-; GFX9-NEXT: s_mul_hi_u32 s15, s10, s13
-; GFX9-NEXT: s_mul_i32 s13, s10, s13
+; GFX9-NEXT: s_mul_hi_u32 s15, s8, s13
+; GFX9-NEXT: s_mul_i32 s13, s8, s13
; GFX9-NEXT: s_add_u32 s12, s12, s13
-; GFX9-NEXT: s_mul_hi_u32 s14, s10, s5
+; GFX9-NEXT: s_mul_hi_u32 s14, s8, s10
; GFX9-NEXT: s_addc_u32 s11, s11, s15
; GFX9-NEXT: s_addc_u32 s12, s14, 0
-; GFX9-NEXT: s_mul_i32 s5, s10, s5
-; GFX9-NEXT: s_add_u32 s5, s11, s5
+; GFX9-NEXT: s_mul_i32 s10, s8, s10
+; GFX9-NEXT: s_add_u32 s10, s11, s10
; GFX9-NEXT: s_addc_u32 s11, 0, s12
-; GFX9-NEXT: s_add_u32 s12, s4, s5
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s10, s10, s11
-; GFX9-NEXT: s_mul_i32 s4, s8, s10
-; GFX9-NEXT: s_mul_hi_u32 s5, s8, s12
-; GFX9-NEXT: s_add_i32 s4, s5, s4
-; GFX9-NEXT: s_mul_i32 s9, s9, s12
-; GFX9-NEXT: s_add_i32 s4, s4, s9
-; GFX9-NEXT: s_mul_i32 s8, s8, s12
-; GFX9-NEXT: s_mul_hi_u32 s9, s10, s8
-; GFX9-NEXT: s_mul_i32 s11, s10, s8
-; GFX9-NEXT: s_mul_i32 s14, s12, s4
-; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8
-; GFX9-NEXT: s_mul_hi_u32 s13, s12, s4
-; GFX9-NEXT: s_add_u32 s8, s8, s14
+; GFX9-NEXT: s_add_u32 s9, s9, s10
+; GFX9-NEXT: s_addc_u32 s8, s8, s11
+; GFX9-NEXT: s_mul_i32 s10, s4, s8
+; GFX9-NEXT: s_mul_hi_u32 s11, s4, s9
+; GFX9-NEXT: s_add_i32 s10, s11, s10
+; GFX9-NEXT: s_mul_i32 s5, s5, s9
+; GFX9-NEXT: s_add_i32 s10, s10, s5
+; GFX9-NEXT: s_mul_i32 s4, s4, s9
+; GFX9-NEXT: s_mul_hi_u32 s11, s8, s4
+; GFX9-NEXT: s_mul_i32 s12, s8, s4
+; GFX9-NEXT: s_mul_i32 s14, s9, s10
+; GFX9-NEXT: s_mul_hi_u32 s4, s9, s4
+; GFX9-NEXT: s_mul_hi_u32 s13, s9, s10
+; GFX9-NEXT: s_add_u32 s4, s4, s14
; GFX9-NEXT: s_addc_u32 s13, 0, s13
-; GFX9-NEXT: s_add_u32 s8, s8, s11
-; GFX9-NEXT: s_mul_hi_u32 s5, s10, s4
-; GFX9-NEXT: s_addc_u32 s8, s13, s9
+; GFX9-NEXT: s_add_u32 s4, s4, s12
+; GFX9-NEXT: s_mul_hi_u32 s5, s8, s10
+; GFX9-NEXT: s_addc_u32 s4, s13, s11
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s4, s10, s4
-; GFX9-NEXT: s_add_u32 s4, s8, s4
-; GFX9-NEXT: s_addc_u32 s8, 0, s5
-; GFX9-NEXT: s_add_u32 s9, s12, s4
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s8, s10, s8
+; GFX9-NEXT: s_mul_i32 s10, s8, s10
+; GFX9-NEXT: s_add_u32 s4, s4, s10
+; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_add_u32 s9, s9, s4
+; GFX9-NEXT: s_addc_u32 s8, s8, s5
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: s_add_u32 s2, s2, s4
@@ -9309,11 +9267,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_mul_i32 s8, s6, s8
; GFX9-NEXT: s_sub_u32 s2, s2, s8
; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_subb_u32 s13, s10, s7
; GFX9-NEXT: s_sub_u32 s14, s2, s6
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
; GFX9-NEXT: s_subb_u32 s15, s13, 0
; GFX9-NEXT: s_cmp_ge_u32 s15, s7
; GFX9-NEXT: s_cselect_b32 s16, -1, 0
@@ -9322,13 +9278,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX9-NEXT: s_cmp_eq_u32 s15, s7
; GFX9-NEXT: s_cselect_b32 s16, s17, s16
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s13, s13, s7
-; GFX9-NEXT: s_sub_u32 s17, s14, s6
-; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s10, s13, 0
+; GFX9-NEXT: s_subb_u32 s10, s13, s7
+; GFX9-NEXT: s_sub_u32 s11, s14, s6
+; GFX9-NEXT: s_subb_u32 s10, s10, 0
; GFX9-NEXT: s_cmp_lg_u32 s16, 0
-; GFX9-NEXT: s_cselect_b32 s11, s17, s14
+; GFX9-NEXT: s_cselect_b32 s11, s11, s14
; GFX9-NEXT: s_cselect_b32 s10, s10, s15
; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_subb_u32 s3, s3, s12
@@ -9490,10 +9444,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_addc_u32 s15, 0, s16
; GFX6-NEXT: s_add_u32 s16, s6, s7
; GFX6-NEXT: v_mov_b32_e32 v0, s16
-; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX6-NEXT: s_or_b32 s6, s6, s7
-; GFX6-NEXT: s_cmp_lg_u32 s6, 0
; GFX6-NEXT: s_addc_u32 s14, s14, s15
; GFX6-NEXT: s_mul_i32 s6, s12, s14
; GFX6-NEXT: v_readfirstlane_b32 s7, v0
@@ -9524,7 +9477,6 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_add_u32 s13, s16, s6
; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX6-NEXT: s_or_b32 s6, s6, s7
-; GFX6-NEXT: s_cmp_lg_u32 s6, 0
; GFX6-NEXT: s_addc_u32 s12, s14, s12
; GFX6-NEXT: s_ashr_i32 s6, s9, 31
; GFX6-NEXT: s_add_u32 s8, s8, s6
@@ -9557,49 +9509,46 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_readfirstlane_b32 s14, v0
; GFX6-NEXT: s_add_i32 s13, s14, s13
; GFX6-NEXT: s_mul_i32 s14, s3, s12
-; GFX6-NEXT: s_add_i32 s14, s13, s14
-; GFX6-NEXT: s_sub_i32 s15, s9, s14
+; GFX6-NEXT: s_add_i32 s16, s13, s14
+; GFX6-NEXT: s_sub_i32 s14, s9, s16
; GFX6-NEXT: s_mul_i32 s12, s2, s12
; GFX6-NEXT: s_sub_u32 s8, s8, s12
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s16, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_subb_u32 s15, s15, s3
-; GFX6-NEXT: s_sub_u32 s17, s8, s2
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s18, s15, 0
-; GFX6-NEXT: s_cmp_ge_u32 s18, s3
-; GFX6-NEXT: s_cselect_b32 s13, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s17, s2
-; GFX6-NEXT: s_cselect_b32 s19, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s18, s3
-; GFX6-NEXT: s_cselect_b32 s19, s19, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s15, s15, s3
-; GFX6-NEXT: s_sub_u32 s20, s17, s2
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s15, s12, s13
+; GFX6-NEXT: s_subb_u32 s17, s14, s3
+; GFX6-NEXT: s_sub_u32 s18, s8, s2
+; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT: s_or_b32 s19, s14, s15
+; GFX6-NEXT: s_subb_u32 s19, s17, 0
+; GFX6-NEXT: s_cmp_ge_u32 s19, s3
+; GFX6-NEXT: s_cselect_b32 s20, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s18, s2
+; GFX6-NEXT: s_cselect_b32 s21, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s19, s3
+; GFX6-NEXT: s_cselect_b32 s20, s21, s20
+; GFX6-NEXT: s_or_b32 s14, s14, s15
+; GFX6-NEXT: s_subb_u32 s17, s17, s3
+; GFX6-NEXT: s_sub_u32 s21, s18, s2
+; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GFX6-NEXT: s_or_b32 s14, s14, s15
+; GFX6-NEXT: s_subb_u32 s14, s17, 0
+; GFX6-NEXT: s_cmp_lg_u32 s20, 0
+; GFX6-NEXT: s_cselect_b32 s15, s21, s18
+; GFX6-NEXT: s_cselect_b32 s14, s14, s19
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_cmp_lg_u32 s12, 0
-; GFX6-NEXT: s_subb_u32 s12, s15, 0
-; GFX6-NEXT: s_cmp_lg_u32 s19, 0
-; GFX6-NEXT: s_cselect_b32 s13, s20, s17
-; GFX6-NEXT: s_cselect_b32 s12, s12, s18
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_subb_u32 s9, s9, s14
+; GFX6-NEXT: s_subb_u32 s9, s9, s16
; GFX6-NEXT: s_cmp_ge_u32 s9, s3
-; GFX6-NEXT: s_cselect_b32 s14, -1, 0
+; GFX6-NEXT: s_cselect_b32 s12, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s8, s2
; GFX6-NEXT: s_cselect_b32 s2, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s9, s3
-; GFX6-NEXT: s_cselect_b32 s2, s2, s14
+; GFX6-NEXT: s_cselect_b32 s2, s2, s12
; GFX6-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6-NEXT: s_cselect_b32 s3, s12, s9
-; GFX6-NEXT: s_cselect_b32 s2, s13, s8
+; GFX6-NEXT: s_cselect_b32 s3, s14, s9
+; GFX6-NEXT: s_cselect_b32 s2, s15, s8
; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7]
-; GFX6-NEXT: s_sub_u32 s12, s2, s6
-; GFX6-NEXT: s_subb_u32 s13, s3, s6
+; GFX6-NEXT: s_sub_u32 s14, s2, s6
+; GFX6-NEXT: s_subb_u32 s15, s3, s6
; GFX6-NEXT: s_ashr_i32 s2, s1, 31
; GFX6-NEXT: s_add_u32 s0, s0, s2
; GFX6-NEXT: s_mov_b32 s3, s2
@@ -9618,40 +9567,39 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0
-; GFX6-NEXT: v_readfirstlane_b32 s14, v1
+; GFX6-NEXT: v_readfirstlane_b32 s12, v1
; GFX6-NEXT: v_readfirstlane_b32 s2, v0
-; GFX6-NEXT: s_mul_i32 s1, s8, s14
+; GFX6-NEXT: s_mul_i32 s1, s8, s12
; GFX6-NEXT: v_readfirstlane_b32 s3, v2
; GFX6-NEXT: s_mul_i32 s0, s9, s2
; GFX6-NEXT: s_add_i32 s1, s3, s1
; GFX6-NEXT: s_add_i32 s3, s1, s0
-; GFX6-NEXT: s_mul_i32 s15, s8, s2
+; GFX6-NEXT: s_mul_i32 s13, s8, s2
; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3
-; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX6-NEXT: s_mul_i32 s4, s2, s3
; GFX6-NEXT: v_readfirstlane_b32 s5, v2
; GFX6-NEXT: v_readfirstlane_b32 s16, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15
+; GFX6-NEXT: v_mul_hi_u32 v0, v1, s13
; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3
; GFX6-NEXT: s_add_u32 s4, s16, s4
; GFX6-NEXT: s_addc_u32 s5, 0, s5
-; GFX6-NEXT: s_mul_i32 s15, s14, s15
+; GFX6-NEXT: s_mul_i32 s13, s12, s13
; GFX6-NEXT: v_readfirstlane_b32 s16, v0
-; GFX6-NEXT: s_add_u32 s4, s4, s15
+; GFX6-NEXT: s_add_u32 s4, s4, s13
; GFX6-NEXT: s_addc_u32 s4, s5, s16
; GFX6-NEXT: v_readfirstlane_b32 s5, v1
; GFX6-NEXT: s_addc_u32 s5, s5, 0
-; GFX6-NEXT: s_mul_i32 s3, s14, s3
+; GFX6-NEXT: s_mul_i32 s3, s12, s3
; GFX6-NEXT: s_add_u32 s3, s4, s3
; GFX6-NEXT: s_addc_u32 s4, 0, s5
; GFX6-NEXT: s_add_u32 s5, s2, s3
; GFX6-NEXT: v_mov_b32_e32 v0, s5
-; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
+; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6-NEXT: s_addc_u32 s4, s14, s4
+; GFX6-NEXT: s_addc_u32 s4, s12, s4
; GFX6-NEXT: s_mul_i32 s2, s8, s4
; GFX6-NEXT: v_readfirstlane_b32 s3, v0
; GFX6-NEXT: s_add_i32 s2, s3, s2
@@ -9665,102 +9613,98 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
; GFX6-NEXT: s_mul_i32 s9, s5, s2
-; GFX6-NEXT: v_readfirstlane_b32 s15, v2
-; GFX6-NEXT: s_add_u32 s9, s15, s9
-; GFX6-NEXT: v_readfirstlane_b32 s14, v0
+; GFX6-NEXT: v_readfirstlane_b32 s13, v2
+; GFX6-NEXT: s_add_u32 s9, s13, s9
+; GFX6-NEXT: v_readfirstlane_b32 s12, v0
; GFX6-NEXT: s_mul_i32 s3, s4, s3
-; GFX6-NEXT: s_addc_u32 s14, 0, s14
+; GFX6-NEXT: s_addc_u32 s12, 0, s12
; GFX6-NEXT: v_readfirstlane_b32 s8, v3
; GFX6-NEXT: s_add_u32 s3, s9, s3
-; GFX6-NEXT: s_addc_u32 s3, s14, s8
+; GFX6-NEXT: s_addc_u32 s3, s12, s8
; GFX6-NEXT: v_readfirstlane_b32 s8, v1
; GFX6-NEXT: s_addc_u32 s8, s8, 0
; GFX6-NEXT: s_mul_i32 s2, s4, s2
; GFX6-NEXT: s_add_u32 s2, s3, s2
; GFX6-NEXT: s_addc_u32 s8, 0, s8
-; GFX6-NEXT: s_add_u32 s14, s5, s2
+; GFX6-NEXT: s_add_u32 s12, s5, s2
; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6-NEXT: s_addc_u32 s15, s4, s8
+; GFX6-NEXT: s_addc_u32 s13, s4, s8
; GFX6-NEXT: s_ashr_i32 s4, s11, 31
; GFX6-NEXT: s_add_u32 s2, s10, s4
; GFX6-NEXT: s_mov_b32 s5, s4
; GFX6-NEXT: s_addc_u32 s3, s11, s4
; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v0, s15
+; GFX6-NEXT: v_mov_b32_e32 v0, s13
; GFX6-NEXT: v_mul_hi_u32 v1, s8, v0
-; GFX6-NEXT: v_mov_b32_e32 v2, s14
+; GFX6-NEXT: v_mov_b32_e32 v2, s12
; GFX6-NEXT: v_mul_hi_u32 v3, s8, v2
-; GFX6-NEXT: s_mul_i32 s2, s8, s15
+; GFX6-NEXT: s_mul_i32 s2, s8, s13
; GFX6-NEXT: v_readfirstlane_b32 s10, v1
; GFX6-NEXT: v_mul_hi_u32 v1, s9, v2
; GFX6-NEXT: v_readfirstlane_b32 s11, v3
; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0
; GFX6-NEXT: s_add_u32 s2, s11, s2
; GFX6-NEXT: s_addc_u32 s10, 0, s10
-; GFX6-NEXT: s_mul_i32 s11, s9, s14
-; GFX6-NEXT: v_readfirstlane_b32 s14, v1
+; GFX6-NEXT: s_mul_i32 s11, s9, s12
+; GFX6-NEXT: v_readfirstlane_b32 s12, v1
; GFX6-NEXT: s_add_u32 s2, s2, s11
-; GFX6-NEXT: s_addc_u32 s2, s10, s14
+; GFX6-NEXT: s_addc_u32 s2, s10, s12
; GFX6-NEXT: v_readfirstlane_b32 s10, v0
; GFX6-NEXT: s_addc_u32 s10, s10, 0
-; GFX6-NEXT: s_mul_i32 s11, s9, s15
+; GFX6-NEXT: s_mul_i32 s11, s9, s13
; GFX6-NEXT: s_add_u32 s11, s2, s11
; GFX6-NEXT: v_mov_b32_e32 v0, s11
; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
; GFX6-NEXT: s_addc_u32 s10, 0, s10
; GFX6-NEXT: s_mul_i32 s10, s6, s10
; GFX6-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NEXT: v_readfirstlane_b32 s14, v0
-; GFX6-NEXT: s_add_i32 s10, s14, s10
-; GFX6-NEXT: s_mul_i32 s14, s7, s11
-; GFX6-NEXT: s_add_i32 s14, s10, s14
-; GFX6-NEXT: s_sub_i32 s15, s9, s14
+; GFX6-NEXT: v_readfirstlane_b32 s12, v0
+; GFX6-NEXT: s_add_i32 s10, s12, s10
+; GFX6-NEXT: s_mul_i32 s12, s7, s11
+; GFX6-NEXT: s_add_i32 s16, s10, s12
+; GFX6-NEXT: s_sub_i32 s12, s9, s16
; GFX6-NEXT: s_mul_i32 s10, s6, s11
; GFX6-NEXT: s_sub_u32 s8, s8, s10
; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX6-NEXT: s_or_b32 s16, s10, s11
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_subb_u32 s15, s15, s7
-; GFX6-NEXT: s_sub_u32 s17, s8, s6
-; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX6-NEXT: s_or_b32 s10, s10, s11
-; GFX6-NEXT: s_cmp_lg_u32 s10, 0
-; GFX6-NEXT: s_subb_u32 s18, s15, 0
-; GFX6-NEXT: s_cmp_ge_u32 s18, s7
-; GFX6-NEXT: s_cselect_b32 s11, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s17, s6
-; GFX6-NEXT: s_cselect_b32 s19, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s18, s7
-; GFX6-NEXT: s_cselect_b32 s19, s19, s11
-; GFX6-NEXT: s_cmp_lg_u32 s10, 0
-; GFX6-NEXT: s_subb_u32 s15, s15, s7
-; GFX6-NEXT: s_sub_u32 s20, s17, s6
-; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GFX6-NEXT: s_or_b32 s13, s10, s11
+; GFX6-NEXT: s_subb_u32 s17, s12, s7
+; GFX6-NEXT: s_sub_u32 s18, s8, s6
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s19, s12, s13
+; GFX6-NEXT: s_subb_u32 s19, s17, 0
+; GFX6-NEXT: s_cmp_ge_u32 s19, s7
+; GFX6-NEXT: s_cselect_b32 s20, -1, 0
+; GFX6-NEXT: s_cmp_ge_u32 s18, s6
+; GFX6-NEXT: s_cselect_b32 s21, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s19, s7
+; GFX6-NEXT: s_cselect_b32 s20, s21, s20
+; GFX6-NEXT: s_or_b32 s12, s12, s13
+; GFX6-NEXT: s_subb_u32 s17, s17, s7
+; GFX6-NEXT: s_sub_u32 s21, s18, s6
+; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GFX6-NEXT: s_or_b32 s12, s12, s13
+; GFX6-NEXT: s_subb_u32 s12, s17, 0
+; GFX6-NEXT: s_cmp_lg_u32 s20, 0
+; GFX6-NEXT: s_cselect_b32 s13, s21, s18
+; GFX6-NEXT: s_cselect_b32 s12, s12, s19
; GFX6-NEXT: s_or_b32 s10, s10, s11
-; GFX6-NEXT: s_cmp_lg_u32 s10, 0
-; GFX6-NEXT: s_subb_u32 s10, s15, 0
-; GFX6-NEXT: s_cmp_lg_u32 s19, 0
-; GFX6-NEXT: s_cselect_b32 s11, s20, s17
-; GFX6-NEXT: s_cselect_b32 s10, s10, s18
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_subb_u32 s9, s9, s14
+; GFX6-NEXT: s_subb_u32 s9, s9, s16
; GFX6-NEXT: s_cmp_ge_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s14, -1, 0
+; GFX6-NEXT: s_cselect_b32 s10, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s8, s6
; GFX6-NEXT: s_cselect_b32 s6, -1, 0
; GFX6-NEXT: s_cmp_eq_u32 s9, s7
-; GFX6-NEXT: s_cselect_b32 s6, s6, s14
+; GFX6-NEXT: s_cselect_b32 s6, s6, s10
; GFX6-NEXT: s_cmp_lg_u32 s6, 0
-; GFX6-NEXT: s_cselect_b32 s7, s10, s9
-; GFX6-NEXT: s_cselect_b32 s6, s11, s8
+; GFX6-NEXT: s_cselect_b32 s7, s12, s9
+; GFX6-NEXT: s_cselect_b32 s6, s13, s8
; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
; GFX6-NEXT: s_sub_u32 s5, s6, s4
; GFX6-NEXT: s_subb_u32 s4, s7, s4
; GFX6-NEXT: s_mov_b32 s2, -1
-; GFX6-NEXT: v_mov_b32_e32 v0, s12
-; GFX6-NEXT: v_mov_b32_e32 v1, s13
+; GFX6-NEXT: v_mov_b32_e32 v0, s14
+; GFX6-NEXT: v_mov_b32_e32 v1, s15
; GFX6-NEXT: v_mov_b32_e32 v2, s5
; GFX6-NEXT: v_mov_b32_e32 v3, s4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -9780,8 +9724,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7]
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
-; GFX9-NEXT: s_sub_u32 s12, 0, s2
-; GFX9-NEXT: s_subb_u32 s13, 0, s3
+; GFX9-NEXT: s_sub_u32 s6, 0, s2
+; GFX9-NEXT: s_subb_u32 s7, 0, s3
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -9790,56 +9734,52 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s14, v1
-; GFX9-NEXT: v_readfirstlane_b32 s6, v0
-; GFX9-NEXT: s_mul_i32 s7, s12, s14
-; GFX9-NEXT: s_mul_hi_u32 s16, s12, s6
-; GFX9-NEXT: s_mul_i32 s15, s13, s6
-; GFX9-NEXT: s_add_i32 s7, s16, s7
-; GFX9-NEXT: s_mul_i32 s17, s12, s6
-; GFX9-NEXT: s_add_i32 s7, s7, s15
-; GFX9-NEXT: s_mul_hi_u32 s16, s6, s17
-; GFX9-NEXT: s_mul_i32 s18, s6, s7
-; GFX9-NEXT: s_mul_hi_u32 s15, s6, s7
+; GFX9-NEXT: v_readfirstlane_b32 s12, v1
+; GFX9-NEXT: v_readfirstlane_b32 s13, v0
+; GFX9-NEXT: s_mul_i32 s14, s6, s12
+; GFX9-NEXT: s_mul_hi_u32 s16, s6, s13
+; GFX9-NEXT: s_mul_i32 s15, s7, s13
+; GFX9-NEXT: s_add_i32 s14, s16, s14
+; GFX9-NEXT: s_mul_i32 s17, s6, s13
+; GFX9-NEXT: s_add_i32 s14, s14, s15
+; GFX9-NEXT: s_mul_hi_u32 s16, s13, s17
+; GFX9-NEXT: s_mul_i32 s18, s13, s14
+; GFX9-NEXT: s_mul_hi_u32 s15, s13, s14
; GFX9-NEXT: s_add_u32 s16, s16, s18
; GFX9-NEXT: s_addc_u32 s15, 0, s15
-; GFX9-NEXT: s_mul_hi_u32 s18, s14, s17
-; GFX9-NEXT: s_mul_i32 s17, s14, s17
+; GFX9-NEXT: s_mul_hi_u32 s18, s12, s17
+; GFX9-NEXT: s_mul_i32 s17, s12, s17
; GFX9-NEXT: s_add_u32 s16, s16, s17
-; GFX9-NEXT: s_mul_hi_u32 s19, s14, s7
+; GFX9-NEXT: s_mul_hi_u32 s19, s12, s14
; GFX9-NEXT: s_addc_u32 s15, s15, s18
; GFX9-NEXT: s_addc_u32 s16, s19, 0
-; GFX9-NEXT: s_mul_i32 s7, s14, s7
-; GFX9-NEXT: s_add_u32 s7, s15, s7
+; GFX9-NEXT: s_mul_i32 s14, s12, s14
+; GFX9-NEXT: s_add_u32 s14, s15, s14
; GFX9-NEXT: s_addc_u32 s15, 0, s16
-; GFX9-NEXT: s_add_u32 s16, s6, s7
-; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX9-NEXT: s_addc_u32 s14, s14, s15
-; GFX9-NEXT: s_mul_i32 s6, s12, s14
-; GFX9-NEXT: s_mul_hi_u32 s7, s12, s16
-; GFX9-NEXT: s_add_i32 s6, s7, s6
-; GFX9-NEXT: s_mul_i32 s13, s13, s16
-; GFX9-NEXT: s_add_i32 s6, s6, s13
-; GFX9-NEXT: s_mul_i32 s12, s12, s16
-; GFX9-NEXT: s_mul_hi_u32 s13, s14, s12
-; GFX9-NEXT: s_mul_i32 s15, s14, s12
-; GFX9-NEXT: s_mul_i32 s18, s16, s6
-; GFX9-NEXT: s_mul_hi_u32 s12, s16, s12
-; GFX9-NEXT: s_mul_hi_u32 s17, s16, s6
-; GFX9-NEXT: s_add_u32 s12, s12, s18
+; GFX9-NEXT: s_add_u32 s13, s13, s14
+; GFX9-NEXT: s_addc_u32 s12, s12, s15
+; GFX9-NEXT: s_mul_i32 s14, s6, s12
+; GFX9-NEXT: s_mul_hi_u32 s15, s6, s13
+; GFX9-NEXT: s_add_i32 s14, s15, s14
+; GFX9-NEXT: s_mul_i32 s7, s7, s13
+; GFX9-NEXT: s_add_i32 s14, s14, s7
+; GFX9-NEXT: s_mul_i32 s6, s6, s13
+; GFX9-NEXT: s_mul_hi_u32 s15, s12, s6
+; GFX9-NEXT: s_mul_i32 s16, s12, s6
+; GFX9-NEXT: s_mul_i32 s18, s13, s14
+; GFX9-NEXT: s_mul_hi_u32 s6, s13, s6
+; GFX9-NEXT: s_mul_hi_u32 s17, s13, s14
+; GFX9-NEXT: s_add_u32 s6, s6, s18
; GFX9-NEXT: s_addc_u32 s17, 0, s17
-; GFX9-NEXT: s_add_u32 s12, s12, s15
-; GFX9-NEXT: s_mul_hi_u32 s7, s14, s6
-; GFX9-NEXT: s_addc_u32 s12, s17, s13
+; GFX9-NEXT: s_add_u32 s6, s6, s16
+; GFX9-NEXT: s_mul_hi_u32 s7, s12, s14
+; GFX9-NEXT: s_addc_u32 s6, s17, s15
; GFX9-NEXT: s_addc_u32 s7, s7, 0
-; GFX9-NEXT: s_mul_i32 s6, s14, s6
-; GFX9-NEXT: s_add_u32 s6, s12, s6
-; GFX9-NEXT: s_addc_u32 s12, 0, s7
-; GFX9-NEXT: s_add_u32 s13, s16, s6
-; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX9-NEXT: s_addc_u32 s12, s14, s12
+; GFX9-NEXT: s_mul_i32 s14, s12, s14
+; GFX9-NEXT: s_add_u32 s6, s6, s14
+; GFX9-NEXT: s_addc_u32 s7, 0, s7
+; GFX9-NEXT: s_add_u32 s13, s13, s6
+; GFX9-NEXT: s_addc_u32 s12, s12, s7
; GFX9-NEXT: s_ashr_i32 s6, s9, 31
; GFX9-NEXT: s_add_u32 s8, s8, s6
; GFX9-NEXT: s_mov_b32 s7, s6
@@ -9868,11 +9808,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_mul_i32 s12, s2, s12
; GFX9-NEXT: s_sub_u32 s8, s8, s12
; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
; GFX9-NEXT: s_subb_u32 s17, s14, s3
; GFX9-NEXT: s_sub_u32 s18, s8, s2
; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
; GFX9-NEXT: s_subb_u32 s19, s17, 0
; GFX9-NEXT: s_cmp_ge_u32 s19, s3
; GFX9-NEXT: s_cselect_b32 s20, -1, 0
@@ -9881,13 +9819,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s19, s3
; GFX9-NEXT: s_cselect_b32 s20, s21, s20
; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT: s_subb_u32 s17, s17, s3
-; GFX9-NEXT: s_sub_u32 s21, s18, s2
-; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GFX9-NEXT: s_subb_u32 s14, s17, 0
+; GFX9-NEXT: s_subb_u32 s14, s17, s3
+; GFX9-NEXT: s_sub_u32 s15, s18, s2
+; GFX9-NEXT: s_subb_u32 s14, s14, 0
; GFX9-NEXT: s_cmp_lg_u32 s20, 0
-; GFX9-NEXT: s_cselect_b32 s15, s21, s18
+; GFX9-NEXT: s_cselect_b32 s15, s15, s18
; GFX9-NEXT: s_cselect_b32 s14, s14, s19
; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0
; GFX9-NEXT: s_subb_u32 s9, s9, s16
@@ -9911,8 +9847,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT: s_sub_u32 s6, 0, s2
-; GFX9-NEXT: s_subb_u32 s7, 0, s3
+; GFX9-NEXT: s_sub_u32 s4, 0, s2
+; GFX9-NEXT: s_subb_u32 s5, 0, s3
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
@@ -9922,74 +9858,70 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: v_readfirstlane_b32 s6, v1
; GFX9-NEXT: v_readfirstlane_b32 s9, v2
-; GFX9-NEXT: s_mul_hi_u32 s8, s6, s4
-; GFX9-NEXT: s_mul_i32 s14, s6, s9
-; GFX9-NEXT: s_mul_i32 s5, s7, s4
+; GFX9-NEXT: s_mul_hi_u32 s8, s4, s6
+; GFX9-NEXT: s_mul_i32 s14, s4, s9
+; GFX9-NEXT: s_mul_i32 s7, s5, s6
; GFX9-NEXT: s_add_i32 s8, s8, s14
-; GFX9-NEXT: s_add_i32 s8, s8, s5
-; GFX9-NEXT: s_mul_i32 s15, s6, s4
-; GFX9-NEXT: s_mul_i32 s14, s4, s8
-; GFX9-NEXT: s_mul_hi_u32 s16, s4, s15
-; GFX9-NEXT: s_mul_hi_u32 s5, s4, s8
+; GFX9-NEXT: s_add_i32 s8, s8, s7
+; GFX9-NEXT: s_mul_i32 s15, s4, s6
+; GFX9-NEXT: s_mul_i32 s14, s6, s8
+; GFX9-NEXT: s_mul_hi_u32 s16, s6, s15
+; GFX9-NEXT: s_mul_hi_u32 s7, s6, s8
; GFX9-NEXT: s_add_u32 s14, s16, s14
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_addc_u32 s7, 0, s7
; GFX9-NEXT: s_mul_hi_u32 s17, s9, s15
; GFX9-NEXT: s_mul_i32 s15, s9, s15
; GFX9-NEXT: s_add_u32 s14, s14, s15
; GFX9-NEXT: s_mul_hi_u32 s16, s9, s8
-; GFX9-NEXT: s_addc_u32 s5, s5, s17
+; GFX9-NEXT: s_addc_u32 s7, s7, s17
; GFX9-NEXT: s_addc_u32 s14, s16, 0
; GFX9-NEXT: s_mul_i32 s8, s9, s8
-; GFX9-NEXT: s_add_u32 s5, s5, s8
+; GFX9-NEXT: s_add_u32 s7, s7, s8
; GFX9-NEXT: s_addc_u32 s8, 0, s14
-; GFX9-NEXT: s_add_u32 s14, s4, s5
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s8, s9, s8
-; GFX9-NEXT: s_mul_i32 s4, s6, s8
-; GFX9-NEXT: s_mul_hi_u32 s5, s6, s14
-; GFX9-NEXT: s_add_i32 s4, s5, s4
-; GFX9-NEXT: s_mul_i32 s7, s7, s14
-; GFX9-NEXT: s_add_i32 s4, s4, s7
-; GFX9-NEXT: s_mul_i32 s6, s6, s14
-; GFX9-NEXT: s_mul_hi_u32 s7, s8, s6
-; GFX9-NEXT: s_mul_i32 s9, s8, s6
-; GFX9-NEXT: s_mul_i32 s16, s14, s4
-; GFX9-NEXT: s_mul_hi_u32 s6, s14, s6
-; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4
-; GFX9-NEXT: s_add_u32 s6, s6, s16
+; GFX9-NEXT: s_add_u32 s6, s6, s7
+; GFX9-NEXT: s_addc_u32 s7, s9, s8
+; GFX9-NEXT: s_mul_i32 s8, s4, s7
+; GFX9-NEXT: s_mul_hi_u32 s9, s4, s6
+; GFX9-NEXT: s_add_i32 s8, s9, s8
+; GFX9-NEXT: s_mul_i32 s5, s5, s6
+; GFX9-NEXT: s_add_i32 s8, s8, s5
+; GFX9-NEXT: s_mul_i32 s4, s4, s6
+; GFX9-NEXT: s_mul_hi_u32 s9, s7, s4
+; GFX9-NEXT: s_mul_i32 s14, s7, s4
+; GFX9-NEXT: s_mul_i32 s16, s6, s8
+; GFX9-NEXT: s_mul_hi_u32 s4, s6, s4
+; GFX9-NEXT: s_mul_hi_u32 s15, s6, s8
+; GFX9-NEXT: s_add_u32 s4, s4, s16
; GFX9-NEXT: s_addc_u32 s15, 0, s15
-; GFX9-NEXT: s_add_u32 s6, s6, s9
-; GFX9-NEXT: s_mul_hi_u32 s5, s8, s4
-; GFX9-NEXT: s_addc_u32 s6, s15, s7
+; GFX9-NEXT: s_add_u32 s4, s4, s14
+; GFX9-NEXT: s_mul_hi_u32 s5, s7, s8
+; GFX9-NEXT: s_addc_u32 s4, s15, s9
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_mul_i32 s4, s8, s4
-; GFX9-NEXT: s_add_u32 s4, s6, s4
-; GFX9-NEXT: s_addc_u32 s6, 0, s5
-; GFX9-NEXT: s_add_u32 s9, s14, s4
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s8, s8, s6
+; GFX9-NEXT: s_mul_i32 s8, s7, s8
+; GFX9-NEXT: s_add_u32 s4, s4, s8
+; GFX9-NEXT: s_addc_u32 s5, 0, s5
+; GFX9-NEXT: s_add_u32 s8, s6, s4
+; GFX9-NEXT: s_addc_u32 s9, s7, s5
; GFX9-NEXT: s_ashr_i32 s4, s11, 31
; GFX9-NEXT: s_add_u32 s6, s10, s4
; GFX9-NEXT: s_mov_b32 s5, s4
; GFX9-NEXT: s_addc_u32 s7, s11, s4
; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
-; GFX9-NEXT: s_mul_i32 s11, s6, s8
-; GFX9-NEXT: s_mul_hi_u32 s14, s6, s9
-; GFX9-NEXT: s_mul_hi_u32 s10, s6, s8
+; GFX9-NEXT: s_mul_i32 s11, s6, s9
+; GFX9-NEXT: s_mul_hi_u32 s14, s6, s8
+; GFX9-NEXT: s_mul_hi_u32 s10, s6, s9
; GFX9-NEXT: s_add_u32 s11, s14, s11
; GFX9-NEXT: s_addc_u32 s10, 0, s10
-; GFX9-NEXT: s_mul_hi_u32 s15, s7, s9
-; GFX9-NEXT: s_mul_i32 s9, s7, s9
-; GFX9-NEXT: s_add_u32 s9, s11, s9
-; GFX9-NEXT: s_mul_hi_u32 s14, s7, s8
-; GFX9-NEXT: s_addc_u32 s9, s10, s15
-; GFX9-NEXT: s_addc_u32 s10, s14, 0
+; GFX9-NEXT: s_mul_hi_u32 s15, s7, s8
; GFX9-NEXT: s_mul_i32 s8, s7, s8
-; GFX9-NEXT: s_add_u32 s8, s9, s8
+; GFX9-NEXT: s_add_u32 s8, s11, s8
+; GFX9-NEXT: s_mul_hi_u32 s14, s7, s9
+; GFX9-NEXT: s_addc_u32 s8, s10, s15
+; GFX9-NEXT: s_addc_u32 s10, s14, 0
+; GFX9-NEXT: s_mul_i32 s9, s7, s9
+; GFX9-NEXT: s_add_u32 s8, s8, s9
; GFX9-NEXT: s_addc_u32 s9, 0, s10
; GFX9-NEXT: s_mul_i32 s9, s2, s9
; GFX9-NEXT: s_mul_hi_u32 s10, s2, s8
@@ -10000,11 +9932,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_mul_i32 s8, s2, s8
; GFX9-NEXT: s_sub_u32 s6, s6, s8
; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_subb_u32 s15, s10, s3
; GFX9-NEXT: s_sub_u32 s16, s6, s2
; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
; GFX9-NEXT: s_subb_u32 s17, s15, 0
; GFX9-NEXT: s_cmp_ge_u32 s17, s3
; GFX9-NEXT: s_cselect_b32 s18, -1, 0
@@ -10013,13 +9943,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cmp_eq_u32 s17, s3
; GFX9-NEXT: s_cselect_b32 s18, s19, s18
; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s15, s15, s3
-; GFX9-NEXT: s_sub_u32 s19, s16, s2
-; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s10, s15, 0
+; GFX9-NEXT: s_subb_u32 s10, s15, s3
+; GFX9-NEXT: s_sub_u32 s11, s16, s2
+; GFX9-NEXT: s_subb_u32 s10, s10, 0
; GFX9-NEXT: s_cmp_lg_u32 s18, 0
-; GFX9-NEXT: s_cselect_b32 s11, s19, s16
+; GFX9-NEXT: s_cselect_b32 s11, s11, s16
; GFX9-NEXT: s_cselect_b32 s10, s10, s17
; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_subb_u32 s7, s7, s14
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index 394727c..01f4414 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -612,12 +612,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s8
+; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -653,12 +652,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s8
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -693,11 +691,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
-; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -733,11 +730,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -774,11 +770,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
-; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -818,11 +813,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -859,11 +853,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -901,15 +894,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -999,12 +992,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s8
+; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1042,12 +1034,11 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s8
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1084,11 +1075,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
-; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1127,11 +1117,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1171,11 +1160,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
-; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1218,11 +1206,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1261,11 +1248,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX12W64-NEXT: s_cbranch_scc1 .LBB3_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1306,15 +1292,15 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB3_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX12W32-NEXT: s_cbranch_scc1 .LBB3_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2073,12 +2059,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s8
+; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB7_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2114,12 +2099,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s8
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2154,11 +2138,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
-; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2194,11 +2177,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2235,11 +2217,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
-; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2279,11 +2260,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2321,11 +2301,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2363,15 +2342,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 258bc295..9db6d70 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -717,12 +717,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
+; GFX8_ITERATIVE-NEXT: s_add_i32 s6, s6, s3
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -762,12 +761,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s6, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
+; GFX9_ITERATIVE-NEXT: s_add_i32 s6, s6, s3
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -805,13 +803,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1
; GFX1064_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s7, s[0:1]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s7
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s2
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064_ITERATIVE-NEXT: s_add_i32 s6, s6, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -853,11 +850,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032_ITERATIVE-NEXT: s_add_i32 s6, s6, s2
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -897,14 +893,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1164_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s7
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_add_i32 s6, s6, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -949,11 +944,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1132_ITERATIVE-NEXT: s_add_i32 s6, s6, s2
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -993,14 +987,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1264_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s7
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7
-; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8
-; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -1028,6 +1022,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xf1ff
; GFX1264_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -1041,15 +1036,15 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1232_ITERATIVE-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1
-; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2363,7 +2358,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2416,7 +2410,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2462,13 +2455,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s2
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s2
; GFX1064_ITERATIVE-NEXT: s_add_u32 s6, s6, s3
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2515,13 +2507,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s7, s1
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s6, s1
; GFX1032_ITERATIVE-NEXT: s_add_u32 s6, s6, s2
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2569,14 +2560,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v2, s2
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s2
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s2
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
; GFX1164_ITERATIVE-NEXT: s_add_u32 s6, s6, s3
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s7, s7, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2626,14 +2616,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
; GFX1132_ITERATIVE-NEXT: s_add_u32 s6, s6, s2
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s7, s7, s3
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2677,16 +2666,16 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1264_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10
-; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9]
; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
-; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -2731,17 +2720,17 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232_ITERATIVE-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
-; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8
-; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -4490,12 +4479,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s2
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
+; GFX8_ITERATIVE-NEXT: s_add_i32 s12, s12, s3
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4550,12 +4538,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s2
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s12, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
+; GFX9_ITERATIVE-NEXT: s_add_i32 s12, s12, s3
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4608,13 +4595,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr2
; GFX1064_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[0:1]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s6
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s6
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s2
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1064_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064_ITERATIVE-NEXT: s_add_i32 s12, s12, s7
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4670,11 +4656,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s8, s1
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s3
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032_ITERATIVE-NEXT: s_add_i32 s8, s8, s2
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -4728,14 +4713,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr2
; GFX1164_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s6, s[0:1]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s6
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s6
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s2
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v2, s12, s2
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1164_ITERATIVE-NEXT: s_add_i32 s12, s12, s6
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_add_i32 s12, s12, s7
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -4799,11 +4783,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v2, s8, s1
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1132_ITERATIVE-NEXT: s_add_i32 s8, s8, s2
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -4861,14 +4844,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1264_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s7, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s2, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s7, v1, s2
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s2
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
+; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s7
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s7
-; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s7
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s7
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1264_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s8
-; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -4896,6 +4879,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xf1ff
; GFX1264_ITERATIVE-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -4909,15 +4893,15 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0
; GFX1232_ITERATIVE-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v1, s1
-; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s3
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1232_ITERATIVE-NEXT: s_add_co_i32 s6, s6, s2
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -6673,7 +6657,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6746,7 +6729,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6812,13 +6794,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s2, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2
; GFX1064_ITERATIVE-NEXT: s_add_u32 s8, s8, s3
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s9, s9, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6883,13 +6864,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s1, s0
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s1
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1
; GFX1032_ITERATIVE-NEXT: s_add_u32 s8, s8, s2
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -6955,14 +6935,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v1, s2
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s2
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s2
; GFX1164_ITERATIVE-NEXT: s_add_u32 s8, s8, s3
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s9, s9, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -7036,14 +7015,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s1
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v5, s9, s1
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v4, s8, s1
; GFX1132_ITERATIVE-NEXT: s_add_u32 s8, s8, s2
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s9, s9, s3
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -7109,16 +7087,16 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1264_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1264_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[0:1]
+; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s8
+; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s8
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s8
+; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s8
+; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1264_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s10
-; GFX1264_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s10
-; GFX1264_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s10
-; GFX1264_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s10
; GFX1264_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[8:9]
; GFX1264_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
-; GFX1264_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1264_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1264_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1264_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -7163,17 +7141,17 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX1232_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: s_ctz_i32_b32 s1, s0
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s3, v3, s1
; GFX1232_ITERATIVE-NEXT: v_readlane_b32 s2, v2, s1
-; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v1, s7, s1
; GFX1232_ITERATIVE-NEXT: v_writelane_b32 v0, s6, s1
-; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s8
-; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
+; GFX1232_ITERATIVE-NEXT: s_lshl_b32 s1, 1, s1
; GFX1232_ITERATIVE-NEXT: s_wait_alu 0xfffe
-; GFX1232_ITERATIVE-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1232_ITERATIVE-NEXT: s_and_not1_b32 s0, s0, s1
+; GFX1232_ITERATIVE-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[2:3]
; GFX1232_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1232_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1232_ITERATIVE-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 23c5f4f..6167a84 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -499,12 +499,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -540,12 +539,11 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -580,11 +578,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -621,11 +618,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -663,11 +659,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -707,11 +702,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1088,11 +1082,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX8_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1117,11 +1110,10 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX9_ITERATIVE-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1147,9 +1139,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1176,9 +1167,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1206,10 +1196,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1239,10 +1227,8 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2022,7 +2008,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2071,7 +2056,6 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2112,13 +2096,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6
; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2160,13 +2143,12 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2209,14 +2191,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -2261,14 +2242,13 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -2881,7 +2861,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2914,7 +2893,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2946,7 +2924,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2979,7 +2956,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3013,8 +2989,6 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s4
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3048,9 +3022,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s4
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s5
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3906,12 +3879,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3947,12 +3919,11 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3987,11 +3958,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4028,11 +3998,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4070,11 +4039,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -4114,11 +4082,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -4495,11 +4462,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX8_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
+; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX8_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4524,11 +4490,10 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX9_ITERATIVE-NEXT: .LBB11_1: ; %ComputeLoop
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s4, v0, s3
+; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s4
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX9_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4554,9 +4519,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX1064_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4583,9 +4547,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4613,10 +4576,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[4:5], 1, s3
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
; GFX1164_ITERATIVE-NEXT: s_add_i32 s2, s2, s6
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4646,10 +4607,8 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5452,7 +5411,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5501,7 +5459,6 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5542,13 +5499,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s6, s[2:3]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v0, s6
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s6
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s6
; GFX1064_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
; GFX1064_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5590,13 +5546,12 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
+; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
; GFX1032_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5639,14 +5594,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v2, s6
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v3, s6
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s6
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_add_u32 s0, s0, s7
; GFX1164_ITERATIVE-NEXT: s_addc_u32 s1, s1, s8
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s6
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -5691,14 +5645,13 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
-; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
+; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_add_u32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_addc_u32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -6313,12 +6266,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_and_b32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6354,12 +6306,11 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_and_b32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6394,11 +6345,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6435,11 +6385,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_and_b32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6477,11 +6426,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_and_b32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -6521,11 +6469,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_and_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -6926,12 +6873,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX8_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6973,12 +6919,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX9_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7015,15 +6960,14 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1064_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7065,12 +7009,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7109,16 +7052,15 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB16_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1164_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -7163,12 +7105,11 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -7672,12 +7613,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_or_b32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7713,12 +7653,11 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_or_b32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7753,11 +7692,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7794,11 +7732,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_or_b32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7836,11 +7773,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_or_b32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -7880,11 +7816,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_or_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -8284,12 +8219,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX8_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8331,12 +8265,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX9_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8373,15 +8306,14 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1064_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8423,12 +8355,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8467,16 +8398,15 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB18_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1164_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8521,12 +8451,11 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9030,12 +8959,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_xor_b32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9071,12 +8999,11 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_xor_b32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9111,11 +9038,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9152,11 +9078,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9194,11 +9119,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_xor_b32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -9238,11 +9162,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -9642,12 +9565,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX8_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9689,12 +9611,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v2, s1, m0
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s0, m0
; GFX9_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
+; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9731,15 +9652,14 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s10, s[2:3]
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s10
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s10
-; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s10
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s8, s[2:3]
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s8
+; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s8
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1064_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9781,12 +9701,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9825,16 +9744,15 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164_ITERATIVE-NEXT: .LBB20_1: ; %ComputeLoop
; GFX1164_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s10, s[2:3]
+; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s8, s[2:3]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s10
-; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s10
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s10
-; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s10
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s8
+; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s8
+; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s8
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[8:9], 1, s8
; GFX1164_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9879,12 +9797,11 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -10388,12 +10305,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_max_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10429,12 +10345,11 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_max_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10469,11 +10384,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10510,11 +10424,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_max_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10552,11 +10465,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_max_i32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -10596,11 +10508,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_max_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB21_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -11255,7 +11166,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11311,7 +11221,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11363,7 +11272,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11415,7 +11323,6 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11468,9 +11375,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -11525,9 +11431,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB23_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -12214,12 +12119,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_min_i32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12255,12 +12159,11 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_min_i32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12295,11 +12198,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12336,11 +12238,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_min_i32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12378,11 +12279,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_min_i32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -12422,11 +12322,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_min_i32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB24_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -13081,7 +12980,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13137,7 +13035,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13189,7 +13086,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13241,7 +13137,6 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13294,9 +13189,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -13351,9 +13245,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s1, s1, s7
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB26_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -14040,12 +13933,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_max_u32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14081,12 +13973,11 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_max_u32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14121,11 +14012,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14162,11 +14052,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_max_u32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14204,11 +14093,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_max_u32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -14248,11 +14136,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_max_u32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB27_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -14901,7 +14788,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14956,7 +14842,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15007,7 +14892,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15058,7 +14942,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15112,8 +14995,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -15169,8 +15050,6 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB29_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -15853,12 +15732,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX8_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX8_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
+; GFX8_ITERATIVE-NEXT: s_min_u32 s2, s2, s6
+; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15894,12 +15772,11 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9_ITERATIVE-NEXT: s_mov_b32 m0, s3
-; GFX9_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
; GFX9_ITERATIVE-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
+; GFX9_ITERATIVE-NEXT: s_min_u32 s2, s2, s6
+; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15934,11 +15811,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064_ITERATIVE-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX1064_ITERATIVE-NEXT: v_readlane_b32 s8, v0, s3
-; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: v_writelane_b32 v1, s2, s3
-; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1064_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -15975,11 +15851,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1032_ITERATIVE-NEXT: s_min_u32 s0, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s2
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16017,11 +15892,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1164_ITERATIVE-NEXT: v_readlane_b32 s8, v1, s3
-; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: v_writelane_b32 v0, s2, s3
-; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX1164_ITERATIVE-NEXT: s_min_u32 s2, s2, s8
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -16061,11 +15935,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s2, 1, s2
; GFX1132_ITERATIVE-NEXT: s_min_u32 s0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
+; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s2
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB30_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -16715,7 +16588,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX8_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX8_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX8_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX8_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX8_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16770,7 +16642,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX9_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s10
; GFX9_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s8
; GFX9_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX9_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX9_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16821,7 +16692,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1064_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1064_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1064_ITERATIVE-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GFX1064_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1064_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16872,7 +16742,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s3
-; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1032_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16926,8 +16795,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1164_ITERATIVE-NEXT: s_lshl_b64 s[6:7], 1, s10
; GFX1164_ITERATIVE-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7]
-; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_ITERATIVE-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1164_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -16983,8 +16850,6 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_cselect_b32 s0, s0, s6
; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s3, 1, s3
; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s3
-; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB32_1
; GFX1132_ITERATIVE-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132_ITERATIVE-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index e4def28..9afc0c6 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -611,12 +611,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s8
+; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -652,12 +651,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s8
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -692,11 +690,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
-; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -732,11 +729,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -773,11 +769,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
-; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -817,11 +812,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -858,11 +852,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -900,15 +893,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1665,12 +1658,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s8
+; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB6_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1706,12 +1698,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s8
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1746,11 +1737,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
-; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1786,11 +1776,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1827,11 +1816,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
-; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1871,11 +1859,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1913,11 +1900,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX12W64-NEXT: s_cbranch_scc1 .LBB6_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1955,15 +1941,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB6_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX12W32-NEXT: s_cbranch_scc1 .LBB6_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index 39a3c9a..10fd34f 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -628,12 +628,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s8
+; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -670,12 +669,11 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s8
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -711,11 +709,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
-; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -752,11 +749,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -794,11 +790,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
-; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -839,11 +834,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -880,11 +874,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -923,15 +916,15 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB2_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX12W32-NEXT: s_cbranch_scc1 .LBB2_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -1833,12 +1826,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX8-NEXT: s_mov_b32 m0, s3
-; GFX8-NEXT: v_readlane_b32 s8, v0, s3
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX8-NEXT: v_readlane_b32 s6, v0, s3
; GFX8-NEXT: v_writelane_b32 v1, s2, m0
-; GFX8-NEXT: s_add_i32 s2, s2, s8
+; GFX8-NEXT: s_add_i32 s2, s2, s6
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB7_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1875,12 +1867,11 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX9-NEXT: s_mov_b32 m0, s3
-; GFX9-NEXT: v_readlane_b32 s8, v0, s3
-; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
+; GFX9-NEXT: v_readlane_b32 s6, v0, s3
; GFX9-NEXT: v_writelane_b32 v1, s2, m0
-; GFX9-NEXT: s_add_i32 s2, s2, s8
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1916,11 +1907,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W64-NEXT: s_ff1_i32_b64 s3, s[0:1]
; GFX10W64-NEXT: v_readlane_b32 s8, v0, s3
-; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: v_writelane_b32 v1, s2, s3
-; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX10W64-NEXT: s_add_i32 s2, s2, s8
-; GFX10W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10W64-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1957,11 +1947,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
-; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10W32-NEXT: s_andn2_b32 s1, s1, s2
; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1999,11 +1988,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX11W64-NEXT: s_add_i32 s2, s2, s8
-; GFX11W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2044,11 +2032,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
-; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2086,11 +2073,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: s_ctz_i32_b64 s3, s[0:1]
; GFX12W64-NEXT: s_wait_alu 0xfffe
; GFX12W64-NEXT: v_readlane_b32 s8, v1, s3
-; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: v_writelane_b32 v0, s2, s3
-; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
+; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX12W64-NEXT: s_add_co_i32 s2, s2, s8
-; GFX12W64-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12W64-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[6:7]
; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1
; GFX12W64-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
@@ -2129,15 +2115,15 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: ; implicit-def: $vgpr0
; GFX12W32-NEXT: .LBB7_1: ; %ComputeLoop
; GFX12W32-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_lshl_b32 s2, 1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
-; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s2
; GFX12W32-NEXT: s_cbranch_scc1 .LBB7_1
; GFX12W32-NEXT: ; %bb.2: ; %ComputeEnd
; GFX12W32-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index 4a6fa4f..b96de17 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -704,7 +704,6 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI-NEXT: s_add_u32 s4, s4, s6
; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0
; CISI-NEXT: s_or_b32 s6, s12, s13
-; CISI-NEXT: s_cmp_lg_u32 s6, 0
; CISI-NEXT: s_addc_u32 s5, s5, s7
; CISI-NEXT: s_mov_b32 s8, s0
; CISI-NEXT: s_mov_b32 s9, s1
@@ -725,16 +724,14 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: s_add_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_add_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_addc_u32 s1, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
-; VI-NEXT: s_addc_u32 s0, s5, s7
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: v_mov_b32_e32 v5, s0
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -746,12 +743,10 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s2, s12, s14
-; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_addc_u32 s0, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_add_u32 s0, s12, s14
+; GFX9-NEXT: s_addc_u32 s1, s13, s15
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -764,10 +759,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: s_add_u32 s0, s12, s14
-; GFX1010-NEXT: s_cselect_b32 s1, -1, 0
-; GFX1010-NEXT: v_mov_b32_e32 v0, s0
-; GFX1010-NEXT: s_cmp_lg_u32 s1, 0
; GFX1010-NEXT: s_addc_u32 s1, s13, s15
+; GFX1010-NEXT: v_mov_b32_e32 v0, s0
; GFX1010-NEXT: s_cselect_b32 s0, -1, 0
; GFX1010-NEXT: v_mov_b32_e32 v1, s1
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -781,10 +774,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W32-NEXT: s_add_u32 s4, s4, s6
-; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0
-; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0
; GFX1030W32-NEXT: s_addc_u32 s5, s5, s7
+; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4
; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0
; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
@@ -798,10 +789,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W64-NEXT: s_add_u32 s4, s4, s6
-; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX1030W64-NEXT: s_addc_u32 s5, s5, s7
+; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4
; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5
; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
@@ -814,10 +803,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s4, s4, s6
-; GFX11-NEXT: s_cselect_b32 s6, -1, 0
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: s_cmp_lg_u32 s6, 0
; GFX11-NEXT: s_addc_u32 s5, s5, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
@@ -831,10 +818,8 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_add_co_u32 s0, s12, s14
-; GFX1250-NEXT: s_cselect_b32 s1, -1, 0
-; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
-; GFX1250-NEXT: s_cmp_lg_u32 s1, 0
; GFX1250-NEXT: s_add_co_ci_u32 s1, s13, s15
+; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
; GFX1250-NEXT: s_cselect_b32 s0, -1, 0
; GFX1250-NEXT: v_mov_b32_e32 v1, s1
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -1691,7 +1676,6 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI-NEXT: s_sub_u32 s4, s4, s6
; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0
; CISI-NEXT: s_or_b32 s6, s12, s13
-; CISI-NEXT: s_cmp_lg_u32 s6, 0
; CISI-NEXT: s_subb_u32 s5, s5, s7
; CISI-NEXT: s_mov_b32 s8, s0
; CISI-NEXT: s_mov_b32 s9, s1
@@ -1712,16 +1696,14 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: s_sub_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_sub_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_subb_u32 s1, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
-; VI-NEXT: s_subb_u32 s0, s5, s7
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: v_mov_b32_e32 v5, s0
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -1733,12 +1715,10 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s2, s12, s14
-; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_subb_u32 s0, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_sub_u32 s0, s12, s14
+; GFX9-NEXT: s_subb_u32 s1, s13, s15
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -1751,10 +1731,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-NEXT: s_sub_u32 s0, s12, s14
-; GFX1010-NEXT: s_cselect_b32 s1, -1, 0
-; GFX1010-NEXT: v_mov_b32_e32 v0, s0
-; GFX1010-NEXT: s_cmp_lg_u32 s1, 0
; GFX1010-NEXT: s_subb_u32 s1, s13, s15
+; GFX1010-NEXT: v_mov_b32_e32 v0, s0
; GFX1010-NEXT: s_cselect_b32 s0, -1, 0
; GFX1010-NEXT: v_mov_b32_e32 v1, s1
; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -1768,10 +1746,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W32-NEXT: s_sub_u32 s4, s4, s6
-; GFX1030W32-NEXT: s_cselect_b32 s6, -1, 0
-; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030W32-NEXT: s_cmp_lg_u32 s6, 0
; GFX1030W32-NEXT: s_subb_u32 s5, s5, s7
+; GFX1030W32-NEXT: v_mov_b32_e32 v0, s4
; GFX1030W32-NEXT: s_cselect_b32 s4, -1, 0
; GFX1030W32-NEXT: v_mov_b32_e32 v1, s5
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
@@ -1785,10 +1761,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030W64-NEXT: s_sub_u32 s4, s4, s6
-; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX1030W64-NEXT: s_subb_u32 s5, s5, s7
+; GFX1030W64-NEXT: v_mov_b32_e32 v0, s4
; GFX1030W64-NEXT: v_mov_b32_e32 v1, s5
; GFX1030W64-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
@@ -1801,10 +1775,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_u32 s4, s4, s6
-; GFX11-NEXT: s_cselect_b32 s6, -1, 0
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: s_cmp_lg_u32 s6, 0
; GFX11-NEXT: s_subb_u32 s5, s5, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
@@ -1818,10 +1790,8 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_sub_co_u32 s0, s12, s14
-; GFX1250-NEXT: s_cselect_b32 s1, -1, 0
-; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
-; GFX1250-NEXT: s_cmp_lg_u32 s1, 0
; GFX1250-NEXT: s_sub_co_ci_u32 s1, s13, s15
+; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
; GFX1250-NEXT: s_cselect_b32 s0, -1, 0
; GFX1250-NEXT: v_mov_b32_e32 v1, s1
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -2218,49 +2188,46 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; VI-NEXT: s_addc_u32 s6, s7, s9
; VI-NEXT: s_addc_u32 s8, s8, 0
; VI-NEXT: v_readfirstlane_b32 s7, v0
-; VI-NEXT: s_add_u32 s12, s6, s7
-; VI-NEXT: v_mov_b32_e32 v0, s12
+; VI-NEXT: s_add_u32 s10, s6, s7
+; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s4, v0, 0
-; VI-NEXT: s_addc_u32 s13, 0, s8
-; VI-NEXT: s_mul_i32 s8, s4, s13
+; VI-NEXT: s_addc_u32 s11, 0, s8
+; VI-NEXT: s_mul_i32 s8, s4, s11
; VI-NEXT: v_readfirstlane_b32 s9, v1
; VI-NEXT: s_add_i32 s8, s9, s8
-; VI-NEXT: s_mul_i32 s9, s5, s12
-; VI-NEXT: s_add_i32 s14, s8, s9
-; VI-NEXT: s_sub_i32 s10, s3, s14
+; VI-NEXT: s_mul_i32 s9, s5, s10
+; VI-NEXT: s_add_i32 s12, s8, s9
+; VI-NEXT: s_sub_i32 s13, s3, s12
; VI-NEXT: v_readfirstlane_b32 s8, v0
-; VI-NEXT: s_sub_u32 s15, s2, s8
+; VI-NEXT: s_sub_u32 s14, s2, s8
; VI-NEXT: s_cselect_b64 s[8:9], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[8:9], 0
-; VI-NEXT: s_subb_u32 s16, s10, s5
-; VI-NEXT: s_sub_u32 s17, s15, s4
-; VI-NEXT: s_cselect_b64 s[10:11], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[10:11], 0
-; VI-NEXT: s_subb_u32 s10, s16, 0
-; VI-NEXT: s_cmp_ge_u32 s10, s5
-; VI-NEXT: s_cselect_b32 s11, -1, 0
-; VI-NEXT: s_cmp_ge_u32 s17, s4
+; VI-NEXT: s_subb_u32 s13, s13, s5
+; VI-NEXT: s_sub_u32 s15, s14, s4
+; VI-NEXT: s_subb_u32 s13, s13, 0
+; VI-NEXT: s_cmp_ge_u32 s13, s5
; VI-NEXT: s_cselect_b32 s16, -1, 0
-; VI-NEXT: s_cmp_eq_u32 s10, s5
-; VI-NEXT: s_cselect_b32 s10, s16, s11
-; VI-NEXT: s_add_u32 s11, s12, 1
-; VI-NEXT: s_addc_u32 s16, s13, 0
-; VI-NEXT: s_add_u32 s17, s12, 2
-; VI-NEXT: s_addc_u32 s18, s13, 0
-; VI-NEXT: s_cmp_lg_u32 s10, 0
-; VI-NEXT: s_cselect_b32 s10, s17, s11
-; VI-NEXT: s_cselect_b32 s11, s18, s16
+; VI-NEXT: s_cmp_ge_u32 s15, s4
+; VI-NEXT: s_cselect_b32 s15, -1, 0
+; VI-NEXT: s_cmp_eq_u32 s13, s5
+; VI-NEXT: s_cselect_b32 s13, s15, s16
+; VI-NEXT: s_add_u32 s15, s10, 1
+; VI-NEXT: s_addc_u32 s16, s11, 0
+; VI-NEXT: s_add_u32 s17, s10, 2
+; VI-NEXT: s_addc_u32 s18, s11, 0
+; VI-NEXT: s_cmp_lg_u32 s13, 0
+; VI-NEXT: s_cselect_b32 s13, s17, s15
+; VI-NEXT: s_cselect_b32 s15, s18, s16
; VI-NEXT: s_cmp_lg_u64 s[8:9], 0
-; VI-NEXT: s_subb_u32 s3, s3, s14
+; VI-NEXT: s_subb_u32 s3, s3, s12
; VI-NEXT: s_cmp_ge_u32 s3, s5
; VI-NEXT: s_cselect_b32 s8, -1, 0
-; VI-NEXT: s_cmp_ge_u32 s15, s4
+; VI-NEXT: s_cmp_ge_u32 s14, s4
; VI-NEXT: s_cselect_b32 s9, -1, 0
; VI-NEXT: s_cmp_eq_u32 s3, s5
; VI-NEXT: s_cselect_b32 s3, s9, s8
; VI-NEXT: s_cmp_lg_u32 s3, 0
-; VI-NEXT: s_cselect_b32 s9, s11, s13
-; VI-NEXT: s_cselect_b32 s8, s10, s12
+; VI-NEXT: s_cselect_b32 s9, s15, s11
+; VI-NEXT: s_cselect_b32 s8, s13, s10
; VI-NEXT: s_cbranch_execnz .LBB16_4
; VI-NEXT: .LBB16_2:
; VI-NEXT: v_cvt_f32_u32_e32 v0, s4
@@ -2311,8 +2278,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX9-NEXT: ; %bb.1:
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT: s_sub_u32 s10, 0, s6
-; GFX9-NEXT: s_subb_u32 s11, 0, s7
+; GFX9-NEXT: s_sub_u32 s8, 0, s6
+; GFX9-NEXT: s_subb_u32 s9, 0, s7
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX9-NEXT: v_rcp_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2321,109 +2288,102 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX9-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s12, v1
-; GFX9-NEXT: v_readfirstlane_b32 s8, v0
-; GFX9-NEXT: s_mul_i32 s9, s10, s12
-; GFX9-NEXT: s_mul_hi_u32 s14, s10, s8
-; GFX9-NEXT: s_mul_i32 s13, s11, s8
-; GFX9-NEXT: s_add_i32 s9, s14, s9
-; GFX9-NEXT: s_add_i32 s9, s9, s13
-; GFX9-NEXT: s_mul_i32 s15, s10, s8
-; GFX9-NEXT: s_mul_i32 s14, s8, s9
-; GFX9-NEXT: s_mul_hi_u32 s16, s8, s15
-; GFX9-NEXT: s_mul_hi_u32 s13, s8, s9
+; GFX9-NEXT: v_readfirstlane_b32 s10, v1
+; GFX9-NEXT: v_readfirstlane_b32 s11, v0
+; GFX9-NEXT: s_mul_i32 s12, s8, s10
+; GFX9-NEXT: s_mul_hi_u32 s14, s8, s11
+; GFX9-NEXT: s_mul_i32 s13, s9, s11
+; GFX9-NEXT: s_add_i32 s12, s14, s12
+; GFX9-NEXT: s_add_i32 s12, s12, s13
+; GFX9-NEXT: s_mul_i32 s15, s8, s11
+; GFX9-NEXT: s_mul_i32 s14, s11, s12
+; GFX9-NEXT: s_mul_hi_u32 s16, s11, s15
+; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12
; GFX9-NEXT: s_add_u32 s14, s16, s14
; GFX9-NEXT: s_addc_u32 s13, 0, s13
-; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15
-; GFX9-NEXT: s_mul_i32 s15, s12, s15
+; GFX9-NEXT: s_mul_hi_u32 s17, s10, s15
+; GFX9-NEXT: s_mul_i32 s15, s10, s15
; GFX9-NEXT: s_add_u32 s14, s14, s15
-; GFX9-NEXT: s_mul_hi_u32 s16, s12, s9
+; GFX9-NEXT: s_mul_hi_u32 s16, s10, s12
; GFX9-NEXT: s_addc_u32 s13, s13, s17
; GFX9-NEXT: s_addc_u32 s14, s16, 0
-; GFX9-NEXT: s_mul_i32 s9, s12, s9
-; GFX9-NEXT: s_add_u32 s9, s13, s9
+; GFX9-NEXT: s_mul_i32 s12, s10, s12
+; GFX9-NEXT: s_add_u32 s12, s13, s12
; GFX9-NEXT: s_addc_u32 s13, 0, s14
-; GFX9-NEXT: s_add_u32 s14, s8, s9
-; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GFX9-NEXT: s_addc_u32 s12, s12, s13
-; GFX9-NEXT: s_mul_i32 s8, s10, s12
-; GFX9-NEXT: s_mul_hi_u32 s9, s10, s14
-; GFX9-NEXT: s_add_i32 s8, s9, s8
-; GFX9-NEXT: s_mul_i32 s11, s11, s14
-; GFX9-NEXT: s_add_i32 s8, s8, s11
-; GFX9-NEXT: s_mul_i32 s10, s10, s14
-; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10
-; GFX9-NEXT: s_mul_i32 s13, s12, s10
-; GFX9-NEXT: s_mul_i32 s16, s14, s8
-; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10
-; GFX9-NEXT: s_mul_hi_u32 s15, s14, s8
-; GFX9-NEXT: s_add_u32 s10, s10, s16
+; GFX9-NEXT: s_add_u32 s11, s11, s12
+; GFX9-NEXT: s_addc_u32 s10, s10, s13
+; GFX9-NEXT: s_mul_i32 s12, s8, s10
+; GFX9-NEXT: s_mul_hi_u32 s13, s8, s11
+; GFX9-NEXT: s_add_i32 s12, s13, s12
+; GFX9-NEXT: s_mul_i32 s9, s9, s11
+; GFX9-NEXT: s_add_i32 s12, s12, s9
+; GFX9-NEXT: s_mul_i32 s8, s8, s11
+; GFX9-NEXT: s_mul_hi_u32 s13, s10, s8
+; GFX9-NEXT: s_mul_i32 s14, s10, s8
+; GFX9-NEXT: s_mul_i32 s16, s11, s12
+; GFX9-NEXT: s_mul_hi_u32 s8, s11, s8
+; GFX9-NEXT: s_mul_hi_u32 s15, s11, s12
+; GFX9-NEXT: s_add_u32 s8, s8, s16
; GFX9-NEXT: s_addc_u32 s15, 0, s15
-; GFX9-NEXT: s_add_u32 s10, s10, s13
-; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8
-; GFX9-NEXT: s_addc_u32 s10, s15, s11
+; GFX9-NEXT: s_add_u32 s8, s8, s14
+; GFX9-NEXT: s_mul_hi_u32 s9, s10, s12
+; GFX9-NEXT: s_addc_u32 s8, s15, s13
; GFX9-NEXT: s_addc_u32 s9, s9, 0
-; GFX9-NEXT: s_mul_i32 s8, s12, s8
-; GFX9-NEXT: s_add_u32 s8, s10, s8
-; GFX9-NEXT: s_addc_u32 s10, 0, s9
-; GFX9-NEXT: s_add_u32 s11, s14, s8
-; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GFX9-NEXT: s_addc_u32 s8, s12, s10
-; GFX9-NEXT: s_mul_i32 s10, s2, s8
-; GFX9-NEXT: s_mul_hi_u32 s12, s2, s11
-; GFX9-NEXT: s_mul_hi_u32 s9, s2, s8
-; GFX9-NEXT: s_add_u32 s10, s12, s10
+; GFX9-NEXT: s_mul_i32 s12, s10, s12
+; GFX9-NEXT: s_add_u32 s8, s8, s12
; GFX9-NEXT: s_addc_u32 s9, 0, s9
-; GFX9-NEXT: s_mul_hi_u32 s13, s3, s11
-; GFX9-NEXT: s_mul_i32 s11, s3, s11
-; GFX9-NEXT: s_add_u32 s10, s10, s11
-; GFX9-NEXT: s_mul_hi_u32 s12, s3, s8
-; GFX9-NEXT: s_addc_u32 s9, s9, s13
-; GFX9-NEXT: s_addc_u32 s10, s12, 0
+; GFX9-NEXT: s_add_u32 s8, s11, s8
+; GFX9-NEXT: s_addc_u32 s9, s10, s9
+; GFX9-NEXT: s_mul_i32 s11, s2, s9
+; GFX9-NEXT: s_mul_hi_u32 s12, s2, s8
+; GFX9-NEXT: s_mul_hi_u32 s10, s2, s9
+; GFX9-NEXT: s_add_u32 s11, s12, s11
+; GFX9-NEXT: s_addc_u32 s10, 0, s10
+; GFX9-NEXT: s_mul_hi_u32 s13, s3, s8
; GFX9-NEXT: s_mul_i32 s8, s3, s8
-; GFX9-NEXT: s_add_u32 s12, s9, s8
-; GFX9-NEXT: s_addc_u32 s13, 0, s10
-; GFX9-NEXT: s_mul_i32 s8, s6, s13
-; GFX9-NEXT: s_mul_hi_u32 s9, s6, s12
+; GFX9-NEXT: s_add_u32 s8, s11, s8
+; GFX9-NEXT: s_mul_hi_u32 s12, s3, s9
+; GFX9-NEXT: s_addc_u32 s8, s10, s13
+; GFX9-NEXT: s_addc_u32 s10, s12, 0
+; GFX9-NEXT: s_mul_i32 s9, s3, s9
+; GFX9-NEXT: s_add_u32 s11, s8, s9
+; GFX9-NEXT: s_addc_u32 s10, 0, s10
+; GFX9-NEXT: s_mul_i32 s8, s6, s10
+; GFX9-NEXT: s_mul_hi_u32 s9, s6, s11
; GFX9-NEXT: s_add_i32 s8, s9, s8
-; GFX9-NEXT: s_mul_i32 s9, s7, s12
-; GFX9-NEXT: s_add_i32 s14, s8, s9
-; GFX9-NEXT: s_sub_i32 s10, s3, s14
-; GFX9-NEXT: s_mul_i32 s8, s6, s12
-; GFX9-NEXT: s_sub_u32 s15, s2, s8
+; GFX9-NEXT: s_mul_i32 s9, s7, s11
+; GFX9-NEXT: s_add_i32 s12, s8, s9
+; GFX9-NEXT: s_sub_i32 s13, s3, s12
+; GFX9-NEXT: s_mul_i32 s8, s6, s11
+; GFX9-NEXT: s_sub_u32 s14, s2, s8
; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GFX9-NEXT: s_subb_u32 s16, s10, s7
-; GFX9-NEXT: s_sub_u32 s17, s15, s6
-; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GFX9-NEXT: s_subb_u32 s10, s16, 0
-; GFX9-NEXT: s_cmp_ge_u32 s10, s7
-; GFX9-NEXT: s_cselect_b32 s11, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s17, s6
+; GFX9-NEXT: s_subb_u32 s13, s13, s7
+; GFX9-NEXT: s_sub_u32 s15, s14, s6
+; GFX9-NEXT: s_subb_u32 s13, s13, 0
+; GFX9-NEXT: s_cmp_ge_u32 s13, s7
; GFX9-NEXT: s_cselect_b32 s16, -1, 0
-; GFX9-NEXT: s_cmp_eq_u32 s10, s7
-; GFX9-NEXT: s_cselect_b32 s10, s16, s11
-; GFX9-NEXT: s_add_u32 s11, s12, 1
-; GFX9-NEXT: s_addc_u32 s16, s13, 0
-; GFX9-NEXT: s_add_u32 s17, s12, 2
-; GFX9-NEXT: s_addc_u32 s18, s13, 0
-; GFX9-NEXT: s_cmp_lg_u32 s10, 0
-; GFX9-NEXT: s_cselect_b32 s10, s17, s11
-; GFX9-NEXT: s_cselect_b32 s11, s18, s16
+; GFX9-NEXT: s_cmp_ge_u32 s15, s6
+; GFX9-NEXT: s_cselect_b32 s15, -1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s13, s7
+; GFX9-NEXT: s_cselect_b32 s13, s15, s16
+; GFX9-NEXT: s_add_u32 s15, s11, 1
+; GFX9-NEXT: s_addc_u32 s16, s10, 0
+; GFX9-NEXT: s_add_u32 s17, s11, 2
+; GFX9-NEXT: s_addc_u32 s18, s10, 0
+; GFX9-NEXT: s_cmp_lg_u32 s13, 0
+; GFX9-NEXT: s_cselect_b32 s13, s17, s15
+; GFX9-NEXT: s_cselect_b32 s15, s18, s16
; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GFX9-NEXT: s_subb_u32 s3, s3, s14
+; GFX9-NEXT: s_subb_u32 s3, s3, s12
; GFX9-NEXT: s_cmp_ge_u32 s3, s7
; GFX9-NEXT: s_cselect_b32 s8, -1, 0
-; GFX9-NEXT: s_cmp_ge_u32 s15, s6
+; GFX9-NEXT: s_cmp_ge_u32 s14, s6
; GFX9-NEXT: s_cselect_b32 s9, -1, 0
; GFX9-NEXT: s_cmp_eq_u32 s3, s7
; GFX9-NEXT: s_cselect_b32 s3, s9, s8
; GFX9-NEXT: s_cmp_lg_u32 s3, 0
-; GFX9-NEXT: s_cselect_b32 s9, s11, s13
-; GFX9-NEXT: s_cselect_b32 s8, s10, s12
+; GFX9-NEXT: s_cselect_b32 s9, s15, s10
+; GFX9-NEXT: s_cselect_b32 s8, s13, s11
; GFX9-NEXT: s_cbranch_execnz .LBB16_3
; GFX9-NEXT: .LBB16_2:
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
@@ -2503,44 +2463,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1010-NEXT: s_add_u32 s11, s12, s11
; GFX1010-NEXT: s_addc_u32 s12, 0, s13
; GFX1010-NEXT: s_add_u32 s8, s8, s11
-; GFX1010-NEXT: s_cselect_b32 s11, -1, 0
-; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s8
-; GFX1010-NEXT: s_cmp_lg_u32 s11, 0
-; GFX1010-NEXT: s_mul_i32 s11, s9, s8
; GFX1010-NEXT: s_addc_u32 s5, s5, s12
-; GFX1010-NEXT: s_mul_i32 s10, s10, s8
+; GFX1010-NEXT: s_mul_hi_u32 s11, s9, s8
+; GFX1010-NEXT: s_mul_i32 s12, s9, s8
; GFX1010-NEXT: s_mul_i32 s9, s9, s5
-; GFX1010-NEXT: s_mul_hi_u32 s12, s8, s11
-; GFX1010-NEXT: s_add_i32 s9, s13, s9
-; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s11
+; GFX1010-NEXT: s_mul_i32 s10, s10, s8
+; GFX1010-NEXT: s_add_i32 s9, s11, s9
+; GFX1010-NEXT: s_mul_i32 s11, s5, s12
; GFX1010-NEXT: s_add_i32 s9, s9, s10
-; GFX1010-NEXT: s_mul_i32 s10, s5, s11
+; GFX1010-NEXT: s_mul_hi_u32 s10, s8, s12
; GFX1010-NEXT: s_mul_i32 s15, s8, s9
; GFX1010-NEXT: s_mul_hi_u32 s14, s8, s9
-; GFX1010-NEXT: s_add_u32 s12, s12, s15
+; GFX1010-NEXT: s_add_u32 s10, s10, s15
+; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s12
; GFX1010-NEXT: s_addc_u32 s14, 0, s14
-; GFX1010-NEXT: s_mul_hi_u32 s11, s5, s9
-; GFX1010-NEXT: s_add_u32 s10, s12, s10
+; GFX1010-NEXT: s_mul_hi_u32 s12, s5, s9
+; GFX1010-NEXT: s_add_u32 s10, s10, s11
; GFX1010-NEXT: s_mul_i32 s9, s5, s9
; GFX1010-NEXT: s_addc_u32 s10, s14, s13
-; GFX1010-NEXT: s_addc_u32 s11, s11, 0
+; GFX1010-NEXT: s_addc_u32 s11, s12, 0
; GFX1010-NEXT: s_add_u32 s9, s10, s9
; GFX1010-NEXT: s_addc_u32 s10, 0, s11
; GFX1010-NEXT: s_add_u32 s8, s8, s9
-; GFX1010-NEXT: s_cselect_b32 s9, -1, 0
-; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s8
-; GFX1010-NEXT: s_cmp_lg_u32 s9, 0
-; GFX1010-NEXT: s_mul_hi_u32 s9, s3, s8
; GFX1010-NEXT: s_addc_u32 s5, s5, s10
-; GFX1010-NEXT: s_mul_i32 s8, s3, s8
+; GFX1010-NEXT: s_mul_hi_u32 s9, s2, s8
; GFX1010-NEXT: s_mul_i32 s12, s2, s5
-; GFX1010-NEXT: s_mul_hi_u32 s10, s2, s5
-; GFX1010-NEXT: s_add_u32 s11, s11, s12
-; GFX1010-NEXT: s_addc_u32 s10, 0, s10
+; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s5
+; GFX1010-NEXT: s_mul_hi_u32 s10, s3, s8
+; GFX1010-NEXT: s_mul_i32 s8, s3, s8
+; GFX1010-NEXT: s_add_u32 s9, s9, s12
+; GFX1010-NEXT: s_addc_u32 s11, 0, s11
; GFX1010-NEXT: s_mul_hi_u32 s13, s3, s5
-; GFX1010-NEXT: s_add_u32 s8, s11, s8
+; GFX1010-NEXT: s_add_u32 s8, s9, s8
; GFX1010-NEXT: s_mul_i32 s5, s3, s5
-; GFX1010-NEXT: s_addc_u32 s8, s10, s9
+; GFX1010-NEXT: s_addc_u32 s8, s11, s10
; GFX1010-NEXT: s_addc_u32 s9, s13, 0
; GFX1010-NEXT: s_add_u32 s5, s8, s5
; GFX1010-NEXT: s_addc_u32 s8, 0, s9
@@ -2553,11 +2509,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1010-NEXT: s_sub_i32 s11, s3, s9
; GFX1010-NEXT: s_sub_u32 s10, s2, s10
; GFX1010-NEXT: s_cselect_b32 s12, -1, 0
-; GFX1010-NEXT: s_cmp_lg_u32 s12, 0
; GFX1010-NEXT: s_subb_u32 s11, s11, s7
; GFX1010-NEXT: s_sub_u32 s13, s10, s6
-; GFX1010-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1010-NEXT: s_cmp_lg_u32 s14, 0
; GFX1010-NEXT: s_subb_u32 s11, s11, 0
; GFX1010-NEXT: s_cmp_ge_u32 s11, s7
; GFX1010-NEXT: s_cselect_b32 s14, -1, 0
@@ -2663,44 +2616,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: s_add_u32 s11, s12, s11
; GFX1030W32-NEXT: s_addc_u32 s12, 0, s13
; GFX1030W32-NEXT: s_add_u32 s8, s8, s11
-; GFX1030W32-NEXT: s_cselect_b32 s11, -1, 0
-; GFX1030W32-NEXT: s_mul_hi_u32 s13, s9, s8
-; GFX1030W32-NEXT: s_cmp_lg_u32 s11, 0
-; GFX1030W32-NEXT: s_mul_i32 s11, s9, s8
; GFX1030W32-NEXT: s_addc_u32 s7, s7, s12
-; GFX1030W32-NEXT: s_mul_i32 s10, s10, s8
+; GFX1030W32-NEXT: s_mul_hi_u32 s11, s9, s8
+; GFX1030W32-NEXT: s_mul_i32 s12, s9, s8
; GFX1030W32-NEXT: s_mul_i32 s9, s9, s7
-; GFX1030W32-NEXT: s_mul_hi_u32 s12, s8, s11
-; GFX1030W32-NEXT: s_add_i32 s9, s13, s9
-; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s11
+; GFX1030W32-NEXT: s_mul_i32 s10, s10, s8
+; GFX1030W32-NEXT: s_add_i32 s9, s11, s9
+; GFX1030W32-NEXT: s_mul_i32 s11, s7, s12
; GFX1030W32-NEXT: s_add_i32 s9, s9, s10
-; GFX1030W32-NEXT: s_mul_i32 s10, s7, s11
+; GFX1030W32-NEXT: s_mul_hi_u32 s10, s8, s12
; GFX1030W32-NEXT: s_mul_i32 s15, s8, s9
; GFX1030W32-NEXT: s_mul_hi_u32 s14, s8, s9
-; GFX1030W32-NEXT: s_add_u32 s12, s12, s15
+; GFX1030W32-NEXT: s_add_u32 s10, s10, s15
+; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s12
; GFX1030W32-NEXT: s_addc_u32 s14, 0, s14
-; GFX1030W32-NEXT: s_mul_hi_u32 s11, s7, s9
-; GFX1030W32-NEXT: s_add_u32 s10, s12, s10
+; GFX1030W32-NEXT: s_mul_hi_u32 s12, s7, s9
+; GFX1030W32-NEXT: s_add_u32 s10, s10, s11
; GFX1030W32-NEXT: s_mul_i32 s9, s7, s9
; GFX1030W32-NEXT: s_addc_u32 s10, s14, s13
-; GFX1030W32-NEXT: s_addc_u32 s11, s11, 0
+; GFX1030W32-NEXT: s_addc_u32 s11, s12, 0
; GFX1030W32-NEXT: s_add_u32 s9, s10, s9
; GFX1030W32-NEXT: s_addc_u32 s10, 0, s11
; GFX1030W32-NEXT: s_add_u32 s8, s8, s9
-; GFX1030W32-NEXT: s_cselect_b32 s9, -1, 0
-; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s8
-; GFX1030W32-NEXT: s_cmp_lg_u32 s9, 0
-; GFX1030W32-NEXT: s_mul_hi_u32 s9, s3, s8
; GFX1030W32-NEXT: s_addc_u32 s7, s7, s10
-; GFX1030W32-NEXT: s_mul_i32 s8, s3, s8
+; GFX1030W32-NEXT: s_mul_hi_u32 s9, s2, s8
; GFX1030W32-NEXT: s_mul_i32 s12, s2, s7
-; GFX1030W32-NEXT: s_mul_hi_u32 s10, s2, s7
-; GFX1030W32-NEXT: s_add_u32 s11, s11, s12
-; GFX1030W32-NEXT: s_addc_u32 s10, 0, s10
+; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s7
+; GFX1030W32-NEXT: s_mul_hi_u32 s10, s3, s8
+; GFX1030W32-NEXT: s_mul_i32 s8, s3, s8
+; GFX1030W32-NEXT: s_add_u32 s9, s9, s12
+; GFX1030W32-NEXT: s_addc_u32 s11, 0, s11
; GFX1030W32-NEXT: s_mul_hi_u32 s13, s3, s7
-; GFX1030W32-NEXT: s_add_u32 s8, s11, s8
+; GFX1030W32-NEXT: s_add_u32 s8, s9, s8
; GFX1030W32-NEXT: s_mul_i32 s7, s3, s7
-; GFX1030W32-NEXT: s_addc_u32 s8, s10, s9
+; GFX1030W32-NEXT: s_addc_u32 s8, s11, s10
; GFX1030W32-NEXT: s_addc_u32 s9, s13, 0
; GFX1030W32-NEXT: s_add_u32 s7, s8, s7
; GFX1030W32-NEXT: s_addc_u32 s8, 0, s9
@@ -2713,11 +2662,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W32-NEXT: s_sub_i32 s11, s3, s9
; GFX1030W32-NEXT: s_sub_u32 s10, s2, s10
; GFX1030W32-NEXT: s_cselect_b32 s12, -1, 0
-; GFX1030W32-NEXT: s_cmp_lg_u32 s12, 0
; GFX1030W32-NEXT: s_subb_u32 s11, s11, s5
; GFX1030W32-NEXT: s_sub_u32 s13, s10, s4
-; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1030W32-NEXT: s_cmp_lg_u32 s14, 0
; GFX1030W32-NEXT: s_subb_u32 s11, s11, 0
; GFX1030W32-NEXT: s_cmp_ge_u32 s11, s5
; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0
@@ -2790,8 +2736,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W64-NEXT: ; %bb.1:
; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4
; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v1, s5
-; GFX1030W64-NEXT: s_sub_u32 s9, 0, s4
-; GFX1030W64-NEXT: s_subb_u32 s10, 0, s5
+; GFX1030W64-NEXT: s_sub_u32 s8, 0, s4
+; GFX1030W64-NEXT: s_subb_u32 s9, 0, s5
; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0
; GFX1030W64-NEXT: v_rcp_f32_e32 v0, v0
; GFX1030W64-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2800,109 +2746,102 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0
; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX1030W64-NEXT: v_readfirstlane_b32 s8, v1
-; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v0
-; GFX1030W64-NEXT: s_mul_i32 s7, s9, s8
-; GFX1030W64-NEXT: s_mul_hi_u32 s12, s9, s6
-; GFX1030W64-NEXT: s_mul_i32 s11, s10, s6
-; GFX1030W64-NEXT: s_add_i32 s7, s12, s7
-; GFX1030W64-NEXT: s_mul_i32 s13, s9, s6
-; GFX1030W64-NEXT: s_add_i32 s7, s7, s11
-; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s13
-; GFX1030W64-NEXT: s_mul_i32 s15, s6, s7
-; GFX1030W64-NEXT: s_mul_hi_u32 s14, s8, s13
-; GFX1030W64-NEXT: s_mul_i32 s11, s8, s13
-; GFX1030W64-NEXT: s_mul_hi_u32 s13, s6, s7
+; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v1
+; GFX1030W64-NEXT: v_readfirstlane_b32 s7, v0
+; GFX1030W64-NEXT: s_mul_i32 s10, s8, s6
+; GFX1030W64-NEXT: s_mul_hi_u32 s12, s8, s7
+; GFX1030W64-NEXT: s_mul_i32 s11, s9, s7
+; GFX1030W64-NEXT: s_add_i32 s10, s12, s10
+; GFX1030W64-NEXT: s_mul_i32 s13, s8, s7
+; GFX1030W64-NEXT: s_add_i32 s10, s10, s11
+; GFX1030W64-NEXT: s_mul_hi_u32 s12, s7, s13
+; GFX1030W64-NEXT: s_mul_i32 s15, s7, s10
+; GFX1030W64-NEXT: s_mul_hi_u32 s14, s6, s13
+; GFX1030W64-NEXT: s_mul_i32 s11, s6, s13
+; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s10
; GFX1030W64-NEXT: s_add_u32 s12, s12, s15
; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13
-; GFX1030W64-NEXT: s_mul_hi_u32 s16, s8, s7
+; GFX1030W64-NEXT: s_mul_hi_u32 s16, s6, s10
; GFX1030W64-NEXT: s_add_u32 s11, s12, s11
-; GFX1030W64-NEXT: s_mul_i32 s7, s8, s7
+; GFX1030W64-NEXT: s_mul_i32 s10, s6, s10
; GFX1030W64-NEXT: s_addc_u32 s11, s13, s14
; GFX1030W64-NEXT: s_addc_u32 s12, s16, 0
-; GFX1030W64-NEXT: s_add_u32 s7, s11, s7
+; GFX1030W64-NEXT: s_add_u32 s10, s11, s10
; GFX1030W64-NEXT: s_addc_u32 s11, 0, s12
-; GFX1030W64-NEXT: s_add_u32 s12, s6, s7
-; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX1030W64-NEXT: s_mul_hi_u32 s13, s9, s12
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1030W64-NEXT: s_mul_i32 s6, s9, s12
-; GFX1030W64-NEXT: s_addc_u32 s8, s8, s11
-; GFX1030W64-NEXT: s_mul_i32 s10, s10, s12
-; GFX1030W64-NEXT: s_mul_i32 s9, s9, s8
-; GFX1030W64-NEXT: s_mul_hi_u32 s7, s12, s6
-; GFX1030W64-NEXT: s_add_i32 s9, s13, s9
-; GFX1030W64-NEXT: s_mul_hi_u32 s11, s8, s6
-; GFX1030W64-NEXT: s_add_i32 s9, s9, s10
-; GFX1030W64-NEXT: s_mul_i32 s6, s8, s6
-; GFX1030W64-NEXT: s_mul_i32 s14, s12, s9
-; GFX1030W64-NEXT: s_mul_hi_u32 s13, s12, s9
-; GFX1030W64-NEXT: s_add_u32 s7, s7, s14
+; GFX1030W64-NEXT: s_add_u32 s7, s7, s10
+; GFX1030W64-NEXT: s_addc_u32 s6, s6, s11
+; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s7
+; GFX1030W64-NEXT: s_mul_i32 s11, s8, s7
+; GFX1030W64-NEXT: s_mul_i32 s8, s8, s6
+; GFX1030W64-NEXT: s_mul_i32 s9, s9, s7
+; GFX1030W64-NEXT: s_add_i32 s8, s10, s8
+; GFX1030W64-NEXT: s_mul_i32 s10, s6, s11
+; GFX1030W64-NEXT: s_add_i32 s8, s8, s9
+; GFX1030W64-NEXT: s_mul_hi_u32 s9, s7, s11
+; GFX1030W64-NEXT: s_mul_i32 s14, s7, s8
+; GFX1030W64-NEXT: s_mul_hi_u32 s13, s7, s8
+; GFX1030W64-NEXT: s_add_u32 s9, s9, s14
+; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s11
; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13
-; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s9
-; GFX1030W64-NEXT: s_add_u32 s6, s7, s6
-; GFX1030W64-NEXT: s_mul_i32 s9, s8, s9
-; GFX1030W64-NEXT: s_addc_u32 s6, s13, s11
-; GFX1030W64-NEXT: s_addc_u32 s7, s10, 0
-; GFX1030W64-NEXT: s_add_u32 s6, s6, s9
-; GFX1030W64-NEXT: s_addc_u32 s9, 0, s7
-; GFX1030W64-NEXT: s_add_u32 s10, s12, s6
-; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX1030W64-NEXT: s_mul_hi_u32 s11, s2, s10
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1030W64-NEXT: s_mul_hi_u32 s6, s3, s10
-; GFX1030W64-NEXT: s_addc_u32 s7, s8, s9
-; GFX1030W64-NEXT: s_mul_i32 s8, s3, s10
-; GFX1030W64-NEXT: s_mul_i32 s10, s2, s7
-; GFX1030W64-NEXT: s_mul_hi_u32 s9, s2, s7
-; GFX1030W64-NEXT: s_add_u32 s10, s11, s10
-; GFX1030W64-NEXT: s_addc_u32 s9, 0, s9
-; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s7
-; GFX1030W64-NEXT: s_add_u32 s8, s10, s8
+; GFX1030W64-NEXT: s_mul_hi_u32 s11, s6, s8
+; GFX1030W64-NEXT: s_add_u32 s9, s9, s10
+; GFX1030W64-NEXT: s_mul_i32 s8, s6, s8
+; GFX1030W64-NEXT: s_addc_u32 s9, s13, s12
+; GFX1030W64-NEXT: s_addc_u32 s10, s11, 0
+; GFX1030W64-NEXT: s_add_u32 s8, s9, s8
+; GFX1030W64-NEXT: s_addc_u32 s9, 0, s10
+; GFX1030W64-NEXT: s_add_u32 s7, s7, s8
+; GFX1030W64-NEXT: s_addc_u32 s6, s6, s9
+; GFX1030W64-NEXT: s_mul_hi_u32 s8, s2, s7
+; GFX1030W64-NEXT: s_mul_i32 s11, s2, s6
+; GFX1030W64-NEXT: s_mul_hi_u32 s10, s2, s6
+; GFX1030W64-NEXT: s_mul_hi_u32 s9, s3, s7
; GFX1030W64-NEXT: s_mul_i32 s7, s3, s7
-; GFX1030W64-NEXT: s_addc_u32 s6, s9, s6
+; GFX1030W64-NEXT: s_add_u32 s8, s8, s11
+; GFX1030W64-NEXT: s_addc_u32 s10, 0, s10
+; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s6
+; GFX1030W64-NEXT: s_add_u32 s7, s8, s7
+; GFX1030W64-NEXT: s_mul_i32 s6, s3, s6
+; GFX1030W64-NEXT: s_addc_u32 s7, s10, s9
; GFX1030W64-NEXT: s_addc_u32 s8, s12, 0
-; GFX1030W64-NEXT: s_add_u32 s10, s6, s7
+; GFX1030W64-NEXT: s_add_u32 s10, s7, s6
; GFX1030W64-NEXT: s_addc_u32 s11, 0, s8
; GFX1030W64-NEXT: s_mul_hi_u32 s6, s4, s10
; GFX1030W64-NEXT: s_mul_i32 s7, s4, s11
; GFX1030W64-NEXT: s_mul_i32 s8, s5, s10
; GFX1030W64-NEXT: s_add_i32 s6, s6, s7
-; GFX1030W64-NEXT: s_add_i32 s12, s6, s8
+; GFX1030W64-NEXT: s_add_i32 s8, s6, s8
; GFX1030W64-NEXT: s_mul_i32 s6, s4, s10
-; GFX1030W64-NEXT: s_sub_i32 s8, s3, s12
-; GFX1030W64-NEXT: s_sub_u32 s13, s2, s6
+; GFX1030W64-NEXT: s_sub_i32 s9, s3, s8
+; GFX1030W64-NEXT: s_sub_u32 s12, s2, s6
; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1030W64-NEXT: s_subb_u32 s14, s8, s5
-; GFX1030W64-NEXT: s_sub_u32 s15, s13, s4
-; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GFX1030W64-NEXT: s_subb_u32 s8, s14, 0
-; GFX1030W64-NEXT: s_cmp_ge_u32 s8, s5
-; GFX1030W64-NEXT: s_cselect_b32 s9, -1, 0
-; GFX1030W64-NEXT: s_cmp_ge_u32 s15, s4
+; GFX1030W64-NEXT: s_subb_u32 s9, s9, s5
+; GFX1030W64-NEXT: s_sub_u32 s13, s12, s4
+; GFX1030W64-NEXT: s_subb_u32 s9, s9, 0
+; GFX1030W64-NEXT: s_cmp_ge_u32 s9, s5
; GFX1030W64-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1030W64-NEXT: s_cmp_eq_u32 s8, s5
-; GFX1030W64-NEXT: s_cselect_b32 s8, s14, s9
-; GFX1030W64-NEXT: s_add_u32 s9, s10, 1
+; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4
+; GFX1030W64-NEXT: s_cselect_b32 s13, -1, 0
+; GFX1030W64-NEXT: s_cmp_eq_u32 s9, s5
+; GFX1030W64-NEXT: s_cselect_b32 s9, s13, s14
+; GFX1030W64-NEXT: s_add_u32 s13, s10, 1
; GFX1030W64-NEXT: s_addc_u32 s14, s11, 0
; GFX1030W64-NEXT: s_add_u32 s15, s10, 2
; GFX1030W64-NEXT: s_addc_u32 s16, s11, 0
-; GFX1030W64-NEXT: s_cmp_lg_u32 s8, 0
-; GFX1030W64-NEXT: s_cselect_b32 s15, s15, s9
+; GFX1030W64-NEXT: s_cmp_lg_u32 s9, 0
+; GFX1030W64-NEXT: s_cselect_b32 s13, s15, s13
; GFX1030W64-NEXT: s_cselect_b32 s14, s16, s14
; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0
-; GFX1030W64-NEXT: s_subb_u32 s3, s3, s12
+; GFX1030W64-NEXT: s_subb_u32 s3, s3, s8
; GFX1030W64-NEXT: s_cmp_ge_u32 s3, s5
; GFX1030W64-NEXT: s_cselect_b32 s6, -1, 0
-; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4
+; GFX1030W64-NEXT: s_cmp_ge_u32 s12, s4
; GFX1030W64-NEXT: s_cselect_b32 s7, -1, 0
; GFX1030W64-NEXT: s_cmp_eq_u32 s3, s5
; GFX1030W64-NEXT: s_cselect_b32 s3, s7, s6
; GFX1030W64-NEXT: s_cmp_lg_u32 s3, 0
; GFX1030W64-NEXT: s_cselect_b32 s7, s14, s11
-; GFX1030W64-NEXT: s_cselect_b32 s6, s15, s10
+; GFX1030W64-NEXT: s_cselect_b32 s6, s13, s10
; GFX1030W64-NEXT: s_cbranch_execnz .LBB16_3
; GFX1030W64-NEXT: .LBB16_2:
; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4
@@ -2988,44 +2927,40 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: s_add_u32 s11, s12, s11
; GFX11-NEXT: s_addc_u32 s12, 0, s13
; GFX11-NEXT: s_add_u32 s8, s8, s11
-; GFX11-NEXT: s_cselect_b32 s11, -1, 0
-; GFX11-NEXT: s_mul_hi_u32 s13, s9, s8
-; GFX11-NEXT: s_cmp_lg_u32 s11, 0
-; GFX11-NEXT: s_mul_i32 s11, s9, s8
; GFX11-NEXT: s_addc_u32 s7, s7, s12
-; GFX11-NEXT: s_mul_i32 s10, s10, s8
+; GFX11-NEXT: s_mul_hi_u32 s11, s9, s8
+; GFX11-NEXT: s_mul_i32 s12, s9, s8
; GFX11-NEXT: s_mul_i32 s9, s9, s7
-; GFX11-NEXT: s_mul_hi_u32 s12, s8, s11
-; GFX11-NEXT: s_add_i32 s9, s13, s9
-; GFX11-NEXT: s_mul_hi_u32 s13, s7, s11
+; GFX11-NEXT: s_mul_i32 s10, s10, s8
+; GFX11-NEXT: s_add_i32 s9, s11, s9
+; GFX11-NEXT: s_mul_i32 s11, s7, s12
; GFX11-NEXT: s_add_i32 s9, s9, s10
-; GFX11-NEXT: s_mul_i32 s10, s7, s11
+; GFX11-NEXT: s_mul_hi_u32 s10, s8, s12
; GFX11-NEXT: s_mul_i32 s15, s8, s9
; GFX11-NEXT: s_mul_hi_u32 s14, s8, s9
-; GFX11-NEXT: s_add_u32 s12, s12, s15
+; GFX11-NEXT: s_add_u32 s10, s10, s15
+; GFX11-NEXT: s_mul_hi_u32 s13, s7, s12
; GFX11-NEXT: s_addc_u32 s14, 0, s14
-; GFX11-NEXT: s_mul_hi_u32 s11, s7, s9
-; GFX11-NEXT: s_add_u32 s10, s12, s10
+; GFX11-NEXT: s_mul_hi_u32 s12, s7, s9
+; GFX11-NEXT: s_add_u32 s10, s10, s11
; GFX11-NEXT: s_mul_i32 s9, s7, s9
; GFX11-NEXT: s_addc_u32 s10, s14, s13
-; GFX11-NEXT: s_addc_u32 s11, s11, 0
+; GFX11-NEXT: s_addc_u32 s11, s12, 0
; GFX11-NEXT: s_add_u32 s9, s10, s9
; GFX11-NEXT: s_addc_u32 s10, 0, s11
; GFX11-NEXT: s_add_u32 s8, s8, s9
-; GFX11-NEXT: s_cselect_b32 s9, -1, 0
-; GFX11-NEXT: s_mul_hi_u32 s11, s2, s8
-; GFX11-NEXT: s_cmp_lg_u32 s9, 0
-; GFX11-NEXT: s_mul_hi_u32 s9, s3, s8
; GFX11-NEXT: s_addc_u32 s7, s7, s10
-; GFX11-NEXT: s_mul_i32 s8, s3, s8
+; GFX11-NEXT: s_mul_hi_u32 s9, s2, s8
; GFX11-NEXT: s_mul_i32 s12, s2, s7
-; GFX11-NEXT: s_mul_hi_u32 s10, s2, s7
-; GFX11-NEXT: s_add_u32 s11, s11, s12
-; GFX11-NEXT: s_addc_u32 s10, 0, s10
+; GFX11-NEXT: s_mul_hi_u32 s11, s2, s7
+; GFX11-NEXT: s_mul_hi_u32 s10, s3, s8
+; GFX11-NEXT: s_mul_i32 s8, s3, s8
+; GFX11-NEXT: s_add_u32 s9, s9, s12
+; GFX11-NEXT: s_addc_u32 s11, 0, s11
; GFX11-NEXT: s_mul_hi_u32 s13, s3, s7
-; GFX11-NEXT: s_add_u32 s8, s11, s8
+; GFX11-NEXT: s_add_u32 s8, s9, s8
; GFX11-NEXT: s_mul_i32 s7, s3, s7
-; GFX11-NEXT: s_addc_u32 s8, s10, s9
+; GFX11-NEXT: s_addc_u32 s8, s11, s10
; GFX11-NEXT: s_addc_u32 s9, s13, 0
; GFX11-NEXT: s_add_u32 s7, s8, s7
; GFX11-NEXT: s_addc_u32 s8, 0, s9
@@ -3035,17 +2970,14 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX11-NEXT: s_add_i32 s9, s9, s10
; GFX11-NEXT: s_mul_i32 s10, s4, s7
; GFX11-NEXT: s_add_i32 s9, s9, s11
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_sub_i32 s11, s3, s9
; GFX11-NEXT: s_sub_u32 s10, s2, s10
; GFX11-NEXT: s_cselect_b32 s12, -1, 0
-; GFX11-NEXT: s_cmp_lg_u32 s12, 0
; GFX11-NEXT: s_subb_u32 s11, s11, s5
; GFX11-NEXT: s_sub_u32 s13, s10, s4
-; GFX11-NEXT: s_cselect_b32 s14, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_lg_u32 s14, 0
; GFX11-NEXT: s_subb_u32 s11, s11, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_cmp_ge_u32 s11, s5
; GFX11-NEXT: s_cselect_b32 s14, -1, 0
; GFX11-NEXT: s_cmp_ge_u32 s13, s4
@@ -3118,9 +3050,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: s_or_b64 s[4:5], s[2:3], s[6:7]
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_and_b64 s[4:5], s[4:5], 0xffffffff00000000
-; GFX1250-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX1250-NEXT: s_cbranch_scc0 .LBB16_4
; GFX1250-NEXT: ; %bb.1:
; GFX1250-NEXT: s_cvt_f32_u32 s4, s6
@@ -3155,12 +3086,9 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[4:5], s[12:13]
; GFX1250-NEXT: s_add_co_u32 s8, s8, s12
-; GFX1250-NEXT: s_cselect_b32 s4, -1, 0
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-NEXT: s_cmp_lg_u32 s4, 0
; GFX1250-NEXT: s_add_co_ci_u32 s9, s9, s13
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_mul_u64 s[10:11], s[10:11], s[8:9]
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_mul_hi_u32 s13, s8, s11
; GFX1250-NEXT: s_mul_i32 s12, s8, s11
; GFX1250-NEXT: s_mul_hi_u32 s4, s8, s10
@@ -3175,19 +3103,17 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_add_nc_u64 s[10:11], s[4:5], s[10:11]
; GFX1250-NEXT: s_add_co_u32 s8, s8, s10
-; GFX1250-NEXT: s_cselect_b32 s10, -1, 0
-; GFX1250-NEXT: s_mul_hi_u32 s4, s2, s8
-; GFX1250-NEXT: s_cmp_lg_u32 s10, 0
-; GFX1250-NEXT: s_mul_hi_u32 s12, s3, s8
; GFX1250-NEXT: s_add_co_ci_u32 s10, s9, s11
-; GFX1250-NEXT: s_mul_i32 s11, s3, s8
+; GFX1250-NEXT: s_mul_hi_u32 s4, s2, s8
+; GFX1250-NEXT: s_mul_hi_u32 s11, s3, s8
+; GFX1250-NEXT: s_mul_i32 s12, s3, s8
; GFX1250-NEXT: s_mul_hi_u32 s9, s2, s10
; GFX1250-NEXT: s_mul_i32 s8, s2, s10
; GFX1250-NEXT: s_mul_hi_u32 s13, s3, s10
; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[8:9]
; GFX1250-NEXT: s_mul_i32 s10, s3, s10
-; GFX1250-NEXT: s_add_co_u32 s4, s8, s11
-; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s12
+; GFX1250-NEXT: s_add_co_u32 s4, s8, s12
+; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s11
; GFX1250-NEXT: s_add_co_ci_u32 s11, s13, 0
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[10:11]
@@ -3202,10 +3128,8 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GFX1250-NEXT: s_cmp_lg_u32 s8, 0
; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, s7
; GFX1250-NEXT: s_sub_co_u32 s13, s4, s6
-; GFX1250-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1250-NEXT: s_cmp_lg_u32 s14, 0
; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, 0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_cmp_ge_u32 s12, s7
; GFX1250-NEXT: s_cselect_b32 s14, -1, 0
; GFX1250-NEXT: s_cmp_ge_u32 s13, s6
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 4b151b9..07e6a76 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -714,9 +714,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; VI-NEXT: s_lshl_b32 s2, s2, 8
; VI-NEXT: s_or_b32 s2, s2, s3
; VI-NEXT: s_lshl_b32 s3, s2, 16
-; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: s_flbit_i32_b32 s3, s3
-; VI-NEXT: s_cmp_lg_u32 s2, 0
+; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: s_cselect_b32 s2, s3, 32
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index cefcbdd..fca57be 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -1491,7 +1491,6 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s4, s6, 16
-; SI-NEXT: s_cmp_lg_u32 s4, 0
; SI-NEXT: s_cbranch_scc0 .LBB14_4
; SI-NEXT: ; %bb.1: ; %else
; SI-NEXT: s_mov_b32 s11, 0xf000
@@ -1521,7 +1520,6 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s6, 16
-; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: s_cbranch_scc0 .LBB14_4
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_mov_b32 s11, 0xf000
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index d8a5e7fa..dbdea8e 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -14,7 +14,6 @@ define i32 @s_add_co_select_user() {
; GFX7-NEXT: s_add_u32 s7, s6, s6
; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX7-NEXT: s_or_b32 s4, s4, s5
-; GFX7-NEXT: s_cmp_lg_u32 s4, 0
; GFX7-NEXT: s_addc_u32 s8, s6, 0
; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec
@@ -31,8 +30,6 @@ define i32 @s_add_co_select_user() {
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_add_u32 s7, s6, s6
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
; GFX9-NEXT: s_addc_u32 s8, s6, 0
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
@@ -49,8 +46,6 @@ define i32 @s_add_co_select_user() {
; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s5, s4, s4
-; GFX10-NEXT: s_cselect_b32 s6, -1, 0
-; GFX10-NEXT: s_cmp_lg_u32 s6, 0
; GFX10-NEXT: s_addc_u32 s6, s4, 0
; GFX10-NEXT: s_cselect_b32 s7, -1, 0
; GFX10-NEXT: s_and_b32 s7, s7, exec_lo
@@ -67,16 +62,13 @@ define i32 @s_add_co_select_user() {
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s1, s0, s0
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-NEXT: s_addc_u32 s2, s0, 0
; GFX11-NEXT: s_cselect_b32 s3, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s3, s3, exec_lo
; GFX11-NEXT: s_cselect_b32 s2, s2, 0
; GFX11-NEXT: s_cmp_gt_u32 s0, 31
; GFX11-NEXT: s_cselect_b32 s0, s1, s2
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
@@ -104,7 +96,6 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX7-NEXT: s_add_u32 s0, s2, s2
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX7-NEXT: s_or_b32 s0, s0, s1
-; GFX7-NEXT: s_cmp_lg_u32 s0, 0
; GFX7-NEXT: s_addc_u32 s0, s2, 0
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX7-NEXT: s_andn2_b64 vcc, exec, s[0:1]
@@ -125,12 +116,10 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
;
; GFX9-LABEL: s_add_co_br_user:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0
+; GFX9-NEXT: s_load_dword s0, s[8:9], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s2, s2
-; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_addc_u32 s0, s2, 0
+; GFX9-NEXT: s_add_u32 s1, s0, s0
+; GFX9-NEXT: s_addc_u32 s0, s0, 0
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GFX9-NEXT: s_cbranch_vccnz .LBB1_2
@@ -153,8 +142,6 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s1, s0, s0
-; GFX10-NEXT: s_cselect_b32 s1, -1, 0
-; GFX10-NEXT: s_cmp_lg_u32 s1, 0
; GFX10-NEXT: s_addc_u32 s0, s0, 0
; GFX10-NEXT: s_cselect_b32 s0, -1, 0
; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s0
@@ -178,11 +165,9 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s1, s0, s0
-; GFX11-NEXT: s_cselect_b32 s1, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-NEXT: s_addc_u32 s0, s0, 0
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_vccnz .LBB1_2
; GFX11-NEXT: ; %bb.1: ; %bb0
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 62847b1..9a17538 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1117,7 +1117,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; SI: ; %bb.0:
; SI-NEXT: s_and_b32 s3, s1, 0x1ff
; SI-NEXT: s_or_b32 s0, s3, s0
-; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SI-NEXT: s_lshr_b32 s0, s1, 8
@@ -1169,7 +1168,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; VI: ; %bb.0:
; VI-NEXT: s_and_b32 s3, s1, 0x1ff
; VI-NEXT: s_or_b32 s0, s3, s0
-; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; VI-NEXT: s_lshr_b32 s0, s1, 8
@@ -1217,7 +1215,6 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s3, s1, 0x1ff
; GFX9-NEXT: s_or_b32 s0, s3, s0
-; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9-NEXT: s_lshr_b32 s0, s1, 8
@@ -1264,11 +1261,9 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_and_b32 s3, s1, 0x1ff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_or_b32 s0, s3, s0
-; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-TRUE16-NEXT: s_cselect_b32 s0, -1, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-TRUE16-NEXT: s_bfe_u32 s0, s1, 0xb0014
; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 8
@@ -1320,11 +1315,9 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_and_b32 s3, s1, 0x1ff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_or_b32 s0, s3, s0
-; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s0, 0
; GFX11-FAKE16-NEXT: s_cselect_b32 s0, -1, 0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-FAKE16-NEXT: s_bfe_u32 s0, s1, 0xb0014
; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 8
@@ -4023,7 +4016,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; SI-NEXT: s_and_b32 s6, s4, 0xffe
; SI-NEXT: s_and_b32 s4, s1, 0x1ff
; SI-NEXT: s_or_b32 s0, s4, s0
-; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: v_cvt_f16_f32_e32 v0, s5
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
@@ -4066,7 +4058,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; SI-NEXT: s_and_b32 s5, s0, 0xffe
; SI-NEXT: s_and_b32 s0, s3, 0x1ff
; SI-NEXT: s_or_b32 s0, s0, s2
-; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; SI-NEXT: v_readfirstlane_b32 s0, v2
@@ -4120,10 +4111,9 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; VI-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_lshr_b32 s5, s3, 8
-; VI-NEXT: s_and_b32 s6, s3, 0x1ff
; VI-NEXT: s_and_b32 s5, s5, 0xffe
+; VI-NEXT: s_and_b32 s6, s3, 0x1ff
; VI-NEXT: s_or_b32 s2, s6, s2
-; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cselect_b64 s[6:7], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
; VI-NEXT: s_bfe_u32 s3, s3, 0xb0014
@@ -4163,7 +4153,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; VI-NEXT: s_and_b32 s7, s2, 0xffe
; VI-NEXT: s_and_b32 s2, s1, 0x1ff
; VI-NEXT: s_or_b32 s0, s2, s0
-; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; VI-NEXT: s_bfe_u32 s1, s1, 0xb0014
@@ -4209,10 +4198,9 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; GFX9-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshr_b32 s5, s3, 8
-; GFX9-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX9-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX9-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX9-NEXT: s_or_b32 s2, s6, s2
-; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
; GFX9-NEXT: s_bfe_u32 s6, s3, 0xb0014
@@ -4254,7 +4242,6 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; GFX9-NEXT: s_and_b32 s6, s2, 0xffe
; GFX9-NEXT: s_and_b32 s2, s1, 0x1ff
; GFX9-NEXT: s_or_b32 s0, s2, s0
-; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
@@ -4301,11 +4288,10 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
;
; GFX11-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s5, s3, 0x1ff
-; GFX11-NEXT: s_lshr_b32 s6, s3, 8
-; GFX11-NEXT: s_or_b32 s2, s5, s2
-; GFX11-NEXT: s_and_b32 s5, s6, 0xffe
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-NEXT: s_lshr_b32 s5, s3, 8
+; GFX11-NEXT: s_and_b32 s6, s3, 0x1ff
+; GFX11-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-NEXT: s_or_b32 s2, s6, s2
; GFX11-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
@@ -4348,13 +4334,12 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x40f
; GFX11-NEXT: s_cselect_b32 s2, s5, s6
; GFX11-NEXT: s_lshr_b32 s3, s3, 16
-; GFX11-NEXT: s_and_b32 s6, s1, 0x1ff
; GFX11-NEXT: s_lshr_b32 s5, s1, 8
; GFX11-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX11-NEXT: s_or_b32 s0, s6, s0
+; GFX11-NEXT: s_and_b32 s6, s1, 0x1ff
; GFX11-NEXT: s_and_b32 s5, s5, 0xffe
; GFX11-NEXT: s_or_b32 s2, s3, s2
-; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_or_b32 s0, s6, s0
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index b0dd187..c28b25c7 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -599,10 +599,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; SI-GISEL-NEXT: s_and_b32 s6, s6, 0xffe
; SI-GISEL-NEXT: s_or_b32 s4, s7, s4
-; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; SI-GISEL-NEXT: s_or_b32 s4, s6, s4
-; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9
; SI-GISEL-NEXT: s_lshl_b32 s7, s3, 12
@@ -711,10 +709,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
; VI-GISEL-NEXT: s_or_b32 s2, s6, s2
-; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; VI-GISEL-NEXT: s_or_b32 s2, s5, s2
-; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4
; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12
@@ -824,10 +820,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX9-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
; GFX9-GISEL-NEXT: s_or_b32 s2, s6, s2
-; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; GFX9-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX9-GISEL-NEXT: s_sub_i32 s7, 1, s4
; GFX9-GISEL-NEXT: s_lshl_b32 s6, s4, 12
@@ -937,10 +931,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX950-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; GFX950-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
; GFX950-GISEL-NEXT: s_or_b32 s2, s6, s2
-; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; GFX950-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX950-GISEL-NEXT: s_sub_i32 s7, 1, s4
; GFX950-GISEL-NEXT: s_lshl_b32 s6, s4, 12
@@ -1118,17 +1110,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2
-; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000
@@ -1175,17 +1165,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2
-; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000
@@ -1366,17 +1354,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8
-; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
+; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s4, 0xfc10
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0
-; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2
-; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s6, 1, s4
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000
@@ -1423,17 +1409,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8
-; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
+; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s4, 0xfc10
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0
-; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2
-; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s6, 1, s4
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000
@@ -2154,10 +2138,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe
; SI-GISEL-NEXT: s_or_b32 s4, s9, s4
-; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; SI-GISEL-NEXT: s_or_b32 s4, s8, s4
-; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
; SI-GISEL-NEXT: s_lshl_b32 s8, s8, 9
; SI-GISEL-NEXT: s_lshl_b32 s9, s3, 12
@@ -2193,12 +2175,10 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; SI-GISEL-NEXT: s_and_b32 s4, s4, 0x8000
; SI-GISEL-NEXT: s_addk_i32 s5, 0xfc10
; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe
-; SI-GISEL-NEXT: s_or_b32 s6, s9, s6
; SI-GISEL-NEXT: s_or_b32 s3, s4, s3
-; SI-GISEL-NEXT: s_cmp_lg_u32 s6, 0
+; SI-GISEL-NEXT: s_or_b32 s4, s9, s6
; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; SI-GISEL-NEXT: s_or_b32 s4, s8, s4
-; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9
; SI-GISEL-NEXT: s_lshl_b32 s8, s5, 12
@@ -2355,10 +2335,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; VI-GISEL-NEXT: s_addk_i32 s2, 0xfc10
; VI-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
; VI-GISEL-NEXT: s_or_b32 s4, s8, s4
-; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; VI-GISEL-NEXT: s_or_b32 s3, s3, s4
-; VI-GISEL-NEXT: s_cmp_lg_u32 s3, 0
; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; VI-GISEL-NEXT: s_sub_i32 s9, 1, s2
; VI-GISEL-NEXT: s_lshl_b32 s8, s2, 12
@@ -2392,14 +2370,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; VI-GISEL-NEXT: s_or_b32 s2, s3, s2
; VI-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
; VI-GISEL-NEXT: s_lshr_b32 s4, s7, 8
-; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; VI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; VI-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
+; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; VI-GISEL-NEXT: s_or_b32 s5, s5, s6
-; VI-GISEL-NEXT: s_cmp_lg_u32 s5, 0
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; VI-GISEL-NEXT: s_or_b32 s4, s4, s5
-; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; VI-GISEL-NEXT: s_sub_i32 s8, 1, s3
; VI-GISEL-NEXT: s_lshl_b32 s6, s3, 12
@@ -2555,10 +2531,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX9-GISEL-NEXT: s_addk_i32 s2, 0xfc10
; GFX9-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
; GFX9-GISEL-NEXT: s_or_b32 s4, s8, s4
-; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX9-GISEL-NEXT: s_or_b32 s3, s3, s4
-; GFX9-GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX9-GISEL-NEXT: s_sub_i32 s9, 1, s2
; GFX9-GISEL-NEXT: s_lshl_b32 s8, s2, 12
@@ -2592,14 +2566,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2
; GFX9-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
; GFX9-GISEL-NEXT: s_lshr_b32 s4, s7, 8
-; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; GFX9-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; GFX9-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
+; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; GFX9-GISEL-NEXT: s_or_b32 s5, s5, s6
-; GFX9-GISEL-NEXT: s_cmp_lg_u32 s5, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX9-GISEL-NEXT: s_or_b32 s4, s4, s5
-; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX9-GISEL-NEXT: s_sub_i32 s8, 1, s3
; GFX9-GISEL-NEXT: s_lshl_b32 s6, s3, 12
@@ -2752,10 +2724,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX950-GISEL-NEXT: s_addk_i32 s2, 0xfc10
; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
; GFX950-GISEL-NEXT: s_or_b32 s4, s8, s4
-; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX950-GISEL-NEXT: s_or_b32 s3, s3, s4
-; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX950-GISEL-NEXT: s_sub_i32 s9, 1, s2
; GFX950-GISEL-NEXT: s_lshl_b32 s8, s2, 12
@@ -2789,14 +2759,12 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX950-GISEL-NEXT: s_or_b32 s2, s3, s2
; GFX950-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
; GFX950-GISEL-NEXT: s_lshr_b32 s4, s7, 8
-; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; GFX950-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
+; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; GFX950-GISEL-NEXT: s_or_b32 s5, s5, s6
-; GFX950-GISEL-NEXT: s_cmp_lg_u32 s5, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX950-GISEL-NEXT: s_or_b32 s4, s4, s5
-; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX950-GISEL-NEXT: s_sub_i32 s8, 1, s3
; GFX950-GISEL-NEXT: s_lshl_b32 s6, s3, 12
@@ -3073,17 +3041,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8
-; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s2, 0xfc10
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe
-; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4
-; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s8, 1, s2
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000
@@ -3115,19 +3081,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-TRUE16-NEXT: s_cmpk_eq_i32 s2, 0x40f
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16
-; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6
+; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0
+; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s8, s6
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0
-; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3
-; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000
@@ -3176,17 +3140,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8
-; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s2, 0xfc10
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe
-; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4
-; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s8, 1, s2
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000
@@ -3218,19 +3180,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-FAKE16-NEXT: s_cmpk_eq_i32 s2, 0x40f
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16
-; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6
+; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2
-; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0
+; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s8, s6
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0
-; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3
-; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000
@@ -3511,17 +3471,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX1250-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8
-; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
+; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s2, 0xfc10
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe
-; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
-; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4
-; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s8, 1, s2
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000
@@ -3553,19 +3511,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX1250-GISEL-TRUE16-NEXT: s_cmp_eq_u32 s2, 0x40f
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3
; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16
-; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
; GFX1250-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6
+; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX1250-GISEL-TRUE16-NEXT: s_addk_co_i32 s4, 0xfc10
; GFX1250-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2
-; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s8, s6
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0
-; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3
-; GFX1250-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX1250-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX1250-GISEL-TRUE16-NEXT: s_sub_co_i32 s6, 1, s4
; GFX1250-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000
@@ -3614,17 +3570,15 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX1250-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8
-; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
+; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s2, 0xfc10
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe
-; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
+; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
-; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4
-; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s8, 1, s2
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000
@@ -3656,19 +3610,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX1250-GISEL-FAKE16-NEXT: s_cmp_eq_u32 s2, 0x40f
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3
; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16
-; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
; GFX1250-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6
+; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX1250-GISEL-FAKE16-NEXT: s_addk_co_i32 s4, 0xfc10
; GFX1250-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2
-; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0
+; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s8, s6
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0
-; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3
-; GFX1250-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX1250-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX1250-GISEL-FAKE16-NEXT: s_sub_co_i32 s6, 1, s4
; GFX1250-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index f1165491..b6b26a4 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -182,7 +182,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; SI-NEXT: s_and_b32 s1, s7, 0x1ff
; SI-NEXT: s_and_b32 s8, s0, 0xffe
; SI-NEXT: s_or_b32 s0, s1, s6
-; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014
@@ -237,7 +236,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; VI-SDAG-NEXT: s_and_b32 s8, s4, 0xffe
; VI-SDAG-NEXT: s_and_b32 s4, s7, 0x1ff
; VI-SDAG-NEXT: s_or_b32 s4, s4, s6
-; VI-SDAG-NEXT: s_cmp_lg_u32 s4, 0
; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0
; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
@@ -290,10 +288,8 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
; VI-GISEL-NEXT: s_or_b32 s2, s6, s2
-; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; VI-GISEL-NEXT: s_or_b32 s2, s5, s2
-; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4
; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12
@@ -335,11 +331,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff
-; GFX10-SDAG-NEXT: s_lshr_b32 s5, s3, 8
-; GFX10-SDAG-NEXT: s_or_b32 s2, s4, s2
-; GFX10-SDAG-NEXT: s_and_b32 s4, s5, 0xffe
-; GFX10-SDAG-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10-SDAG-NEXT: s_lshr_b32 s4, s3, 8
+; GFX10-SDAG-NEXT: s_and_b32 s5, s3, 0x1ff
+; GFX10-SDAG-NEXT: s_and_b32 s4, s4, 0xffe
+; GFX10-SDAG-NEXT: s_or_b32 s2, s5, s2
; GFX10-SDAG-NEXT: s_cselect_b32 s2, -1, 0
; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
; GFX10-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014
@@ -387,16 +382,14 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX10-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX10-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2
+; GFX10-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX10-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; GFX10-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10-GISEL-NEXT: s_or_b32 s2, s6, s2
; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; GFX10-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX10-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX10-GISEL-NEXT: s_sub_i32 s6, 1, s4
; GFX10-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
@@ -438,11 +431,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff
-; GFX11-SDAG-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-SDAG-NEXT: s_or_b32 s2, s4, s2
-; GFX11-SDAG-NEXT: s_and_b32 s4, s5, 0xffe
-; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-SDAG-NEXT: s_lshr_b32 s4, s3, 8
+; GFX11-SDAG-NEXT: s_and_b32 s5, s3, 0x1ff
+; GFX11-SDAG-NEXT: s_and_b32 s4, s4, 0xffe
+; GFX11-SDAG-NEXT: s_or_b32 s2, s5, s2
; GFX11-SDAG-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
@@ -498,17 +490,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX11-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2
+; GFX11-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
-; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-GISEL-NEXT: s_or_b32 s2, s6, s2
; GFX11-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX11-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 37756d1..31f277f 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -472,7 +472,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -536,11 +535,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -606,7 +604,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -660,12 +657,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -710,9 +706,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1690,7 +1685,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1754,11 +1748,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1824,7 +1817,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1878,12 +1870,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1928,9 +1919,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2968,7 +2958,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3032,11 +3021,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3102,7 +3090,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3156,12 +3143,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3206,9 +3192,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3742,7 +3727,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3806,11 +3790,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3876,7 +3859,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3930,12 +3912,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v1, s3, v1
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3980,9 +3961,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5019,7 +4999,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5083,11 +5062,10 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5153,7 +5131,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5207,12 +5184,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5270,9 +5246,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6284,7 +6259,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB10_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6354,7 +6328,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6424,7 +6397,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6485,8 +6457,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6550,7 +6520,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7717,7 +7686,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB12_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7787,7 +7755,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7857,7 +7824,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7918,8 +7884,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7983,7 +7947,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9150,7 +9113,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB14_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9220,7 +9182,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9290,7 +9251,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9351,8 +9311,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9416,7 +9374,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10065,7 +10022,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB15_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10135,7 +10091,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10205,7 +10160,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10266,8 +10220,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10331,7 +10283,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11498,7 +11449,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB17_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11568,7 +11518,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11638,7 +11587,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11699,8 +11647,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11764,7 +11710,6 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 6351bb3..4581efc 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -381,13 +381,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX9-NEXT: .LBB1_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -457,7 +456,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -513,7 +511,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
@@ -562,8 +559,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -610,11 +606,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1420,13 +1414,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1496,7 +1489,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1552,7 +1544,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1601,8 +1592,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1649,11 +1639,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2459,13 +2447,12 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_max_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2535,7 +2522,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2591,7 +2577,6 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
@@ -2640,8 +2625,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2688,11 +2672,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3591,7 +3573,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3665,7 +3646,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3724,7 +3704,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3774,8 +3753,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3841,10 +3819,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4859,7 +4836,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB9_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4933,7 +4909,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4992,7 +4967,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5042,8 +5016,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5109,10 +5082,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6127,7 +6099,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6201,7 +6172,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6260,7 +6230,6 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6310,8 +6279,7 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6377,10 +6345,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index a9ac008..bd570d9 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -381,13 +381,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX9-NEXT: .LBB1_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_min_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -457,7 +456,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -513,7 +511,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
@@ -562,8 +559,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -610,11 +606,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1420,13 +1414,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_min_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1496,7 +1489,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1552,7 +1544,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1601,8 +1592,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1649,11 +1639,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2459,13 +2447,12 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX9-NEXT: v_readlane_b32 s4, v0, s2
+; GFX9-NEXT: v_readlane_b32 s3, v0, s2
+; GFX9-NEXT: v_max_f32_e64 v1, s3, s3
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
-; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_min_f32_e32 v2, v1, v2
+; GFX9-NEXT: v_min_f32_e32 v2, v2, v1
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2535,7 +2522,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2591,7 +2577,6 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
@@ -2640,8 +2625,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2688,11 +2672,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3591,7 +3573,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3665,7 +3646,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3724,7 +3704,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3774,8 +3753,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3841,10 +3819,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB7_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4859,7 +4836,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB9_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4933,7 +4909,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4992,7 +4967,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5042,8 +5016,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5109,10 +5082,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6127,7 +6099,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX9-NEXT: s_cbranch_scc1 .LBB11_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6201,7 +6172,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6260,7 +6230,6 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1032-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6310,8 +6279,7 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1164-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6377,10 +6345,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_min_f64 v[4:5], v[2:3], v[4:5]
; GFX1132-NEXT: s_cbranch_scc1 .LBB11_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 6311143..1f2d70c 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -532,7 +532,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -596,11 +595,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -666,7 +664,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -720,12 +717,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -783,9 +779,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1862,7 +1857,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -1926,11 +1920,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1996,7 +1989,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -2050,12 +2042,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2113,9 +2104,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3192,7 +3182,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3256,11 +3245,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3326,7 +3314,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -3380,12 +3367,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3443,9 +3429,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4018,7 +4003,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4082,11 +4066,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4152,7 +4135,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -4206,12 +4188,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4269,9 +4250,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5347,7 +5327,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX9-NEXT: v_readlane_b32 s4, v0, s2
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5411,11 +5390,10 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
-; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1064-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5481,7 +5459,6 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
@@ -5535,12 +5512,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
-; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
+; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_add_f32_e32 v2, s3, v2
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5598,9 +5574,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
-; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
@@ -6612,7 +6587,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB10_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6682,7 +6656,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6752,7 +6725,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6813,8 +6785,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6878,7 +6848,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8044,7 +8013,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB12_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8114,7 +8082,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8184,7 +8151,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8245,8 +8211,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -8310,7 +8274,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9477,7 +9440,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB14_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9547,7 +9509,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9617,7 +9578,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9678,8 +9638,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -9743,7 +9701,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10392,7 +10349,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB15_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10462,7 +10418,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10532,7 +10487,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10593,8 +10547,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -10658,7 +10610,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11824,7 +11775,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB17_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11894,7 +11844,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11964,7 +11913,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12025,8 +11973,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -12090,7 +12036,6 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
-; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index eee232a..c3f3917 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -136,19 +136,17 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
; GFX11-NEXT: .LBB2_6: ; %bb18
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT: v_readfirstlane_b32 s13, v0
-; GFX11-NEXT: s_cmp_lg_u32 s1, 0
-; GFX11-NEXT: s_cselect_b32 s1, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1
-; GFX11-NEXT: s_and_b32 s1, s8, s1
-; GFX11-NEXT: s_and_b32 s1, s1, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_cselect_b32 s13, -1, 0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13
+; GFX11-NEXT: s_and_b32 s13, s8, s13
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_and_b32 s13, s13, exec_lo
; GFX11-NEXT: v_readfirstlane_b32 s19, v2
-; GFX11-NEXT: s_cselect_b32 s1, s19, s13
-; GFX11-NEXT: s_and_b32 s13, 0xffff, s0
+; GFX11-NEXT: s_cselect_b32 s1, s19, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s1, s1, 1
-; GFX11-NEXT: s_cmp_lg_u32 s13, 0
+; GFX11-NEXT: s_and_b32 s13, 0xffff, s0
; GFX11-NEXT: s_cselect_b32 s13, -1, 0
; GFX11-NEXT: s_and_b32 s20, s9, exec_lo
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index 8748aff..6dc9199 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -8265,12 +8265,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_readlane_b32 s6, v1, s3
-; GFX12-NEXT: s_lshl_b32 s7, 1, s3
; GFX12-NEXT: v_writelane_b32 v0, s0, s3
+; GFX12-NEXT: s_lshl_b32 s3, 1, s3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 s1, s1, s7
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12-NEXT: s_and_not1_b32 s1, s1, s3
; GFX12-NEXT: s_add_f32 s0, s0, s6
; GFX12-NEXT: s_cbranch_scc1 .LBB28_5
; GFX12-NEXT: ; %bb.6: ; %ComputeEnd
@@ -8351,14 +8349,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX942-NEXT: .LBB28_5: ; %ComputeLoop
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX942-NEXT: v_readfirstlane_b32 s8, v1
-; GFX942-NEXT: v_readlane_b32 s9, v2, s3
+; GFX942-NEXT: v_readfirstlane_b32 s6, v1
; GFX942-NEXT: s_mov_b32 m0, s3
+; GFX942-NEXT: v_readlane_b32 s8, v2, s3
+; GFX942-NEXT: v_writelane_b32 v0, s6, m0
+; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX942-NEXT: v_writelane_b32 v0, s8, m0
-; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX942-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX942-NEXT: v_add_f32_e32 v1, s8, v1
; GFX942-NEXT: s_cbranch_scc1 .LBB28_5
; GFX942-NEXT: ; %bb.6: ; %ComputeEnd
; GFX942-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8440,15 +8437,14 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX11-NEXT: .LBB28_5: ; %ComputeLoop
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_ctz_i32_b32 s1, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readlane_b32 s6, v2, s1
-; GFX11-NEXT: s_lshl_b32 s7, 1, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 s0, s0, s7
; GFX11-NEXT: v_writelane_b32 v0, s3, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_add_f32_e32 v1, s6, v1
-; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_lshl_b32 s1, 1, s1
+; GFX11-NEXT: s_and_not1_b32 s0, s0, s1
; GFX11-NEXT: s_cbranch_scc1 .LBB28_5
; GFX11-NEXT: ; %bb.6: ; %ComputeEnd
; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8528,11 +8524,10 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX10-NEXT: s_ff1_i32_b32 s1, s0
; GFX10-NEXT: v_readfirstlane_b32 s3, v1
; GFX10-NEXT: v_readlane_b32 s6, v2, s1
-; GFX10-NEXT: s_lshl_b32 s7, 1, s1
-; GFX10-NEXT: s_andn2_b32 s0, s0, s7
; GFX10-NEXT: v_writelane_b32 v0, s3, s1
; GFX10-NEXT: v_add_f32_e32 v1, s6, v1
-; GFX10-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10-NEXT: s_lshl_b32 s1, 1, s1
+; GFX10-NEXT: s_andn2_b32 s0, s0, s1
; GFX10-NEXT: s_cbranch_scc1 .LBB28_5
; GFX10-NEXT: ; %bb.6: ; %ComputeEnd
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8609,14 +8604,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX90A-NEXT: .LBB28_5: ; %ComputeLoop
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX90A-NEXT: v_readfirstlane_b32 s8, v1
-; GFX90A-NEXT: v_readlane_b32 s9, v2, s3
+; GFX90A-NEXT: v_readfirstlane_b32 s6, v1
; GFX90A-NEXT: s_mov_b32 m0, s3
+; GFX90A-NEXT: v_readlane_b32 s8, v2, s3
+; GFX90A-NEXT: v_writelane_b32 v0, s6, m0
+; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX90A-NEXT: v_writelane_b32 v0, s8, m0
-; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX90A-NEXT: v_add_f32_e32 v1, s8, v1
; GFX90A-NEXT: s_cbranch_scc1 .LBB28_5
; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd
; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8692,14 +8686,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX908-NEXT: .LBB28_5: ; %ComputeLoop
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX908-NEXT: v_readfirstlane_b32 s8, v1
-; GFX908-NEXT: v_readlane_b32 s9, v2, s3
+; GFX908-NEXT: v_readfirstlane_b32 s6, v1
; GFX908-NEXT: s_mov_b32 m0, s3
+; GFX908-NEXT: v_readlane_b32 s8, v2, s3
+; GFX908-NEXT: v_writelane_b32 v0, s6, m0
+; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX908-NEXT: v_writelane_b32 v0, s8, m0
-; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX908-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX908-NEXT: v_add_f32_e32 v1, s8, v1
; GFX908-NEXT: s_cbranch_scc1 .LBB28_5
; GFX908-NEXT: ; %bb.6: ; %ComputeEnd
; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -8776,14 +8769,13 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3)
; GFX8-NEXT: .LBB28_5: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX8-NEXT: v_readfirstlane_b32 s8, v1
-; GFX8-NEXT: v_readlane_b32 s9, v2, s3
+; GFX8-NEXT: v_readfirstlane_b32 s6, v1
; GFX8-NEXT: s_mov_b32 m0, s3
+; GFX8-NEXT: v_readlane_b32 s8, v2, s3
+; GFX8-NEXT: v_writelane_b32 v0, s6, m0
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: v_writelane_b32 v0, s8, m0
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX8-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX8-NEXT: v_add_f32_e32 v1, s8, v1
; GFX8-NEXT: s_cbranch_scc1 .LBB28_5
; GFX8-NEXT: ; %bb.6: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9130,12 +9122,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_readlane_b32 s6, v1, s3
-; GFX12-NEXT: s_lshl_b32 s7, 1, s3
; GFX12-NEXT: v_writelane_b32 v0, s0, s3
+; GFX12-NEXT: s_lshl_b32 s3, 1, s3
; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_and_not1_b32 s1, s1, s7
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_cmp_lg_u32 s1, 0
+; GFX12-NEXT: s_and_not1_b32 s1, s1, s3
; GFX12-NEXT: s_add_f32 s0, s0, s6
; GFX12-NEXT: s_cbranch_scc1 .LBB29_5
; GFX12-NEXT: ; %bb.6: ; %ComputeEnd
@@ -9212,14 +9202,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX942-NEXT: .LBB29_5: ; %ComputeLoop
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX942-NEXT: v_readfirstlane_b32 s8, v1
-; GFX942-NEXT: v_readlane_b32 s9, v2, s3
+; GFX942-NEXT: v_readfirstlane_b32 s6, v1
; GFX942-NEXT: s_mov_b32 m0, s3
+; GFX942-NEXT: v_readlane_b32 s8, v2, s3
+; GFX942-NEXT: v_writelane_b32 v0, s6, m0
+; GFX942-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX942-NEXT: v_writelane_b32 v0, s8, m0
-; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX942-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX942-NEXT: v_add_f32_e32 v1, s8, v1
; GFX942-NEXT: s_cbranch_scc1 .LBB29_5
; GFX942-NEXT: ; %bb.6: ; %ComputeEnd
; GFX942-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9296,15 +9285,14 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: .LBB29_5: ; %ComputeLoop
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_ctz_i32_b32 s1, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readlane_b32 s6, v2, s1
-; GFX11-NEXT: s_lshl_b32 s7, 1, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 s0, s0, s7
; GFX11-NEXT: v_writelane_b32 v0, s3, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_add_f32_e32 v1, s6, v1
-; GFX11-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-NEXT: s_lshl_b32 s1, 1, s1
+; GFX11-NEXT: s_and_not1_b32 s0, s0, s1
; GFX11-NEXT: s_cbranch_scc1 .LBB29_5
; GFX11-NEXT: ; %bb.6: ; %ComputeEnd
; GFX11-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9377,11 +9365,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX10-NEXT: s_ff1_i32_b32 s1, s0
; GFX10-NEXT: v_readfirstlane_b32 s3, v1
; GFX10-NEXT: v_readlane_b32 s6, v2, s1
-; GFX10-NEXT: s_lshl_b32 s7, 1, s1
-; GFX10-NEXT: s_andn2_b32 s0, s0, s7
; GFX10-NEXT: v_writelane_b32 v0, s3, s1
; GFX10-NEXT: v_add_f32_e32 v1, s6, v1
-; GFX10-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10-NEXT: s_lshl_b32 s1, 1, s1
+; GFX10-NEXT: s_andn2_b32 s0, s0, s1
; GFX10-NEXT: s_cbranch_scc1 .LBB29_5
; GFX10-NEXT: ; %bb.6: ; %ComputeEnd
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9453,14 +9440,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX90A-NEXT: .LBB29_5: ; %ComputeLoop
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX90A-NEXT: v_readfirstlane_b32 s8, v1
-; GFX90A-NEXT: v_readlane_b32 s9, v2, s3
+; GFX90A-NEXT: v_readfirstlane_b32 s6, v1
; GFX90A-NEXT: s_mov_b32 m0, s3
+; GFX90A-NEXT: v_readlane_b32 s8, v2, s3
+; GFX90A-NEXT: v_writelane_b32 v0, s6, m0
+; GFX90A-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX90A-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX90A-NEXT: v_writelane_b32 v0, s8, m0
-; GFX90A-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX90A-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX90A-NEXT: v_add_f32_e32 v1, s8, v1
; GFX90A-NEXT: s_cbranch_scc1 .LBB29_5
; GFX90A-NEXT: ; %bb.6: ; %ComputeEnd
; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9533,14 +9519,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX908-NEXT: .LBB29_5: ; %ComputeLoop
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX908-NEXT: v_readfirstlane_b32 s8, v1
-; GFX908-NEXT: v_readlane_b32 s9, v2, s3
+; GFX908-NEXT: v_readfirstlane_b32 s6, v1
; GFX908-NEXT: s_mov_b32 m0, s3
+; GFX908-NEXT: v_readlane_b32 s8, v2, s3
+; GFX908-NEXT: v_writelane_b32 v0, s6, m0
+; GFX908-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX908-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX908-NEXT: v_writelane_b32 v0, s8, m0
-; GFX908-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX908-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX908-NEXT: v_add_f32_e32 v1, s8, v1
; GFX908-NEXT: s_cbranch_scc1 .LBB29_5
; GFX908-NEXT: ; %bb.6: ; %ComputeEnd
; GFX908-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
@@ -9614,14 +9599,13 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: .LBB29_5: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1]
-; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
-; GFX8-NEXT: v_readfirstlane_b32 s8, v1
-; GFX8-NEXT: v_readlane_b32 s9, v2, s3
+; GFX8-NEXT: v_readfirstlane_b32 s6, v1
; GFX8-NEXT: s_mov_b32 m0, s3
+; GFX8-NEXT: v_readlane_b32 s8, v2, s3
+; GFX8-NEXT: v_writelane_b32 v0, s6, m0
+; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s3
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[6:7]
-; GFX8-NEXT: v_writelane_b32 v0, s8, m0
-; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX8-NEXT: v_add_f32_e32 v1, s9, v1
+; GFX8-NEXT: v_add_f32_e32 v1, s8, v1
; GFX8-NEXT: s_cbranch_scc1 .LBB29_5
; GFX8-NEXT: ; %bb.6: ; %ComputeEnd
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
index c1cf06e..fba42c4 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
+++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
@@ -388,9 +388,8 @@ body: |
; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY]], implicit-def $scc
- ; GCN-NEXT: S_NOP 0, implicit killed $scc
- ; GCN-NEXT: S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc
+ ; GCN-NEXT: S_BITCMP1_B32 killed [[COPY]], 0, implicit-def $scc
+ ; GCN-NEXT: S_NOP 0, implicit $scc
; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
; GCN-NEXT: S_BRANCH %bb.1
; GCN-NEXT: {{ $}}
@@ -417,6 +416,80 @@ body: |
S_ENDPGM 0
...
+---
+name: xor_1_cmp_lg_0_killed_scc
+body: |
+ ; GCN-LABEL: name: xor_1_cmp_lg_0_killed_scc
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GCN-NEXT: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 1, killed [[COPY]], implicit-def $scc
+ ; GCN-NEXT: S_NOP 0, implicit $scc
+ ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
+ ; GCN-NEXT: S_BRANCH %bb.1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: $sgpr0, $vgpr0_vgpr1
+
+ %0:sreg_32 = COPY $sgpr0
+ %1:sreg_32 = S_XOR_B32 1, killed %0, implicit-def $scc
+ S_NOP 0, implicit killed $scc
+ S_CMP_LG_U32 killed %1:sreg_32, 0, implicit-def $scc
+ S_CBRANCH_SCC0 %bb.2, implicit $scc
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ bb.2:
+ S_ENDPGM 0
+
+...
+---
+name: absdiff_1_cmp_lg_0_killed_scc
+body: |
+ ; GCN-LABEL: name: absdiff_1_cmp_lg_0_killed_scc
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GCN-NEXT: [[S_ABSDIFF_I32_:%[0-9]+]]:sreg_32 = S_ABSDIFF_I32 1, killed [[COPY]], implicit-def $scc
+ ; GCN-NEXT: S_NOP 0, implicit $scc
+ ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
+ ; GCN-NEXT: S_BRANCH %bb.1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: $sgpr0, $vgpr0_vgpr1
+
+ %0:sreg_32 = COPY $sgpr0
+ %1:sreg_32 = S_ABSDIFF_I32 1, killed %0, implicit-def $scc
+ S_NOP 0, implicit killed $scc
+ S_CMP_LG_U32 killed %1:sreg_32, 0, implicit-def $scc
+ S_CBRANCH_SCC0 %bb.2, implicit $scc
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ bb.2:
+ S_ENDPGM 0
+
+...
---
name: and_1_cmp_eq_1_clobbered_scc
@@ -2070,8 +2143,7 @@ body: |
; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 3, killed [[COPY]], implicit-def dead $scc
- ; GCN-NEXT: S_CMP_LG_U32 killed [[S_AND_B32_]], 0, implicit-def $scc
+ ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 3, killed [[COPY]], implicit-def $scc
; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
; GCN-NEXT: S_BRANCH %bb.1
; GCN-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
index f53aaaa..dd5f838 100644
--- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
declare i32 @llvm.ctpop.i32(i32)
declare i64 @llvm.ctpop.i64(i64)
@@ -10,7 +10,6 @@ define amdgpu_ps i32 @shl32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: shl32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshl_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -25,7 +24,6 @@ define amdgpu_ps i32 @shl64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: shl64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -40,7 +38,6 @@ define amdgpu_ps i32 @lshr32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: lshr32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshr_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -55,7 +52,6 @@ define amdgpu_ps i32 @lshr64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: lshr64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -70,7 +66,6 @@ define amdgpu_ps i32 @ashr32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: ashr32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_ashr_i32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -85,7 +80,6 @@ define amdgpu_ps i32 @ashr64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: ashr64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_ashr_i64 s[0:1], s[0:1], s2
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -100,7 +94,6 @@ define amdgpu_ps i32 @abs32(i32 inreg %val0) {
; CHECK-LABEL: abs32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_abs_i32 s0, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -121,7 +114,6 @@ define amdgpu_ps i32 @and32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: and32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -136,7 +128,6 @@ define amdgpu_ps i32 @and64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: and64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -151,7 +142,6 @@ define amdgpu_ps i32 @or32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: or32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_or_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -166,7 +156,6 @@ define amdgpu_ps i32 @or64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: or64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -181,7 +170,6 @@ define amdgpu_ps i32 @xor32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: xor32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_xor_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -196,7 +184,6 @@ define amdgpu_ps i32 @xor64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: xor64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -211,7 +198,6 @@ define amdgpu_ps i32 @nand32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: nand32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_nand_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -231,7 +217,6 @@ define amdgpu_ps i32 @nand64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: nand64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_nand_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -251,7 +236,6 @@ define amdgpu_ps i32 @nor32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: nor32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_nor_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -271,7 +255,6 @@ define amdgpu_ps i32 @nor64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: nor64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -291,7 +274,6 @@ define amdgpu_ps i32 @xnor32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: xnor32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_xnor_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -311,7 +293,6 @@ define amdgpu_ps i32 @xnor64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: xnor64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_xnor_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -331,7 +312,6 @@ define amdgpu_ps i32 @andn232(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: andn232:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_andn2_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -347,7 +327,6 @@ define amdgpu_ps i32 @nandn264(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: nandn264:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -363,7 +342,6 @@ define amdgpu_ps i32 @orn232(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: orn232:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_orn2_b32 s0, s0, s1
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -379,7 +357,6 @@ define amdgpu_ps i32 @orn264(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: orn264:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_orn2_b64 s[0:1], s[0:1], s[2:3]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -395,7 +372,6 @@ define amdgpu_ps i32 @bfe_i32(i32 inreg %val0) {
; CHECK-LABEL: bfe_i32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_bfe_i32 s0, s0, 0x80010
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -433,7 +409,6 @@ define amdgpu_ps i32 @bfe_u32(i32 inreg %val0) {
; CHECK-LABEL: bfe_u32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_bfe_u32 s0, s0, 0x80010
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
@@ -513,7 +488,6 @@ define amdgpu_ps i32 @bcnt132(i32 inreg %val0) {
; CHECK-LABEL: bcnt132:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -552,7 +526,6 @@ define amdgpu_ps i32 @quadmask32(i32 inreg %val0) {
; CHECK-LABEL: quadmask32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_quadmask_b32 s0, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -571,7 +544,6 @@ define amdgpu_ps i32 @quadmask64(i64 inreg %val0) {
; CHECK-LABEL: quadmask64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_quadmask_b64 s[0:1], s[0:1]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -590,7 +562,6 @@ define amdgpu_ps i32 @not32(i32 inreg %val0) {
; CHECK-LABEL: not32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_not_b32 s0, s0
-; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s0
; CHECK-NEXT: ;;#ASMEND
@@ -609,7 +580,6 @@ define amdgpu_ps i32 @not64(i64 inreg %val0) {
; CHECK-LABEL: not64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_not_b64 s[0:1], s[0:1]
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
@@ -623,3 +593,35 @@ define amdgpu_ps i32 @not64(i64 inreg %val0) {
%zext = zext i1 %cmp to i32
ret i32 %zext
}
+
+
+; --------------------------------------------------------------------------------
+; Negative tests
+; --------------------------------------------------------------------------------
+
+@1 = extern_weak dso_local addrspace(4) constant i32
+
+define amdgpu_ps i32 @si_pc_add_rel_offset_must_not_optimize() {
+; CHECK-LABEL: si_pc_add_rel_offset_must_not_optimize:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_getpc_b64 s[0:1]
+; CHECK-NEXT: s_add_u32 s0, s0, __unnamed_1@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s1, s1, __unnamed_1@rel32@hi+12
+; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
+; CHECK-NEXT: s_cbranch_scc0 .LBB35_2
+; CHECK-NEXT: ; %bb.1: ; %endif
+; CHECK-NEXT: s_mov_b32 s0, 1
+; CHECK-NEXT: s_branch .LBB35_3
+; CHECK-NEXT: .LBB35_2: ; %if
+; CHECK-NEXT: s_mov_b32 s0, 0
+; CHECK-NEXT: s_branch .LBB35_3
+; CHECK-NEXT: .LBB35_3:
+ %cmp = icmp ne ptr addrspace(4) @1, null
+ br i1 %cmp, label %endif, label %if
+
+if:
+ ret i32 0
+
+endif:
+ ret i32 1
+}
diff --git a/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll b/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll
index a828ee0..7552f6b 100644
--- a/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll
@@ -12,8 +12,6 @@ define amdgpu_ps i32 @s_uaddo_pseudo(i32 inreg %val0) {
; CHECK-LABEL: s_uaddo_pseudo:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_add_u32 s0, s0, 1
-; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
; CHECK-NEXT: s_addc_u32 s0, 1, 0
; CHECK-NEXT: ; return to shader part epilog
%pair = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %val0, i32 1)
@@ -32,8 +30,6 @@ define amdgpu_ps i32 @s_usubo_pseudo(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: s_usubo_pseudo:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_sub_u32 s0, s0, 1
-; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
-; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0
; CHECK-NEXT: s_subb_u32 s0, s1, 0
; CHECK-NEXT: ; return to shader part epilog
%pair = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %val0, i32 1)
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 5f6d622..71f5a94 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -56,10 +56,9 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_addc_u32 s15, 0, s16
; GCN-NEXT: s_add_u32 s16, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s16
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s12, v0
+; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s14, s14, s15
; GCN-NEXT: s_mul_i32 s0, s12, s14
; GCN-NEXT: v_readfirstlane_b32 s1, v0
@@ -90,7 +89,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_add_u32 s15, s16, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s14, s14, s12
; GCN-NEXT: s_ashr_i32 s12, s7, 31
; GCN-NEXT: s_add_u32 s0, s6, s12
@@ -116,52 +114,50 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: v_readfirstlane_b32 s4, v0
; GCN-NEXT: s_addc_u32 s4, s4, 0
; GCN-NEXT: s_mul_i32 s14, s7, s14
-; GCN-NEXT: s_add_u32 s14, s1, s14
-; GCN-NEXT: v_mov_b32_e32 v0, s14
+; GCN-NEXT: s_add_u32 s16, s1, s14
+; GCN-NEXT: v_mov_b32_e32 v0, s16
; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
-; GCN-NEXT: s_addc_u32 s15, 0, s4
+; GCN-NEXT: s_addc_u32 s17, 0, s4
; GCN-NEXT: s_mov_b32 s1, s5
-; GCN-NEXT: s_mul_i32 s4, s10, s15
+; GCN-NEXT: s_mul_i32 s4, s10, s17
; GCN-NEXT: v_readfirstlane_b32 s5, v0
; GCN-NEXT: s_add_i32 s4, s5, s4
-; GCN-NEXT: s_mul_i32 s5, s11, s14
-; GCN-NEXT: s_add_i32 s16, s4, s5
-; GCN-NEXT: s_sub_i32 s17, s7, s16
-; GCN-NEXT: s_mul_i32 s4, s10, s14
+; GCN-NEXT: s_mul_i32 s5, s11, s16
+; GCN-NEXT: s_add_i32 s18, s4, s5
+; GCN-NEXT: s_sub_i32 s14, s7, s18
+; GCN-NEXT: s_mul_i32 s4, s10, s16
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s18, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s18, 0
-; GCN-NEXT: s_subb_u32 s17, s17, s11
-; GCN-NEXT: s_sub_u32 s19, s6, s10
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT: s_or_b32 s15, s4, s5
+; GCN-NEXT: s_subb_u32 s19, s14, s11
+; GCN-NEXT: s_sub_u32 s20, s6, s10
+; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
+; GCN-NEXT: s_or_b32 s14, s14, s15
+; GCN-NEXT: s_subb_u32 s14, s19, 0
+; GCN-NEXT: s_cmp_ge_u32 s14, s11
+; GCN-NEXT: s_cselect_b32 s15, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s20, s10
+; GCN-NEXT: s_cselect_b32 s19, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s14, s11
+; GCN-NEXT: s_cselect_b32 s14, s19, s15
+; GCN-NEXT: s_add_u32 s15, s16, 1
+; GCN-NEXT: s_addc_u32 s19, s17, 0
+; GCN-NEXT: s_add_u32 s20, s16, 2
+; GCN-NEXT: s_addc_u32 s21, s17, 0
+; GCN-NEXT: s_cmp_lg_u32 s14, 0
+; GCN-NEXT: s_cselect_b32 s14, s20, s15
+; GCN-NEXT: s_cselect_b32 s15, s21, s19
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_subb_u32 s4, s17, 0
+; GCN-NEXT: s_subb_u32 s4, s7, s18
; GCN-NEXT: s_cmp_ge_u32 s4, s11
; GCN-NEXT: s_cselect_b32 s5, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s19, s10
-; GCN-NEXT: s_cselect_b32 s17, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s4, s11
-; GCN-NEXT: s_cselect_b32 s4, s17, s5
-; GCN-NEXT: s_add_u32 s5, s14, 1
-; GCN-NEXT: s_addc_u32 s17, s15, 0
-; GCN-NEXT: s_add_u32 s19, s14, 2
-; GCN-NEXT: s_addc_u32 s20, s15, 0
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_cselect_b32 s4, s19, s5
-; GCN-NEXT: s_cselect_b32 s5, s20, s17
-; GCN-NEXT: s_cmp_lg_u32 s18, 0
-; GCN-NEXT: s_subb_u32 s7, s7, s16
-; GCN-NEXT: s_cmp_ge_u32 s7, s11
-; GCN-NEXT: s_cselect_b32 s16, -1, 0
; GCN-NEXT: s_cmp_ge_u32 s6, s10
; GCN-NEXT: s_cselect_b32 s6, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s7, s11
-; GCN-NEXT: s_cselect_b32 s6, s6, s16
-; GCN-NEXT: s_cmp_lg_u32 s6, 0
-; GCN-NEXT: s_cselect_b32 s5, s5, s15
-; GCN-NEXT: s_cselect_b32 s4, s4, s14
+; GCN-NEXT: s_cmp_eq_u32 s4, s11
+; GCN-NEXT: s_cselect_b32 s4, s6, s5
+; GCN-NEXT: s_cmp_lg_u32 s4, 0
+; GCN-NEXT: s_cselect_b32 s5, s15, s17
+; GCN-NEXT: s_cselect_b32 s4, s14, s16
; GCN-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9]
; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GCN-NEXT: s_sub_u32 s4, s4, s6
@@ -208,7 +204,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_add_u32 s18, s16, 1
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_or_b32 s10, s10, s11
-; GCN-IR-NEXT: s_cmp_lg_u32 s10, 0
; GCN-IR-NEXT: s_addc_u32 s10, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_sub_i32 s16, 63, s16
@@ -242,7 +237,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
; GCN-IR-NEXT: s_or_b32 s20, s20, s21
-; GCN-IR-NEXT: s_cmp_lg_u32 s20, 0
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[12:13], s[8:9]
@@ -1195,10 +1189,9 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_addc_u32 s12, 0, s13
; GCN-NEXT: s_add_u32 s13, s8, s9
; GCN-NEXT: v_mov_b32_e32 v0, s13
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: s_addc_u32 s11, s11, s12
; GCN-NEXT: s_mul_i32 s8, s2, s11
; GCN-NEXT: v_readfirstlane_b32 s9, v0
@@ -1229,7 +1222,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_add_u32 s2, s13, s2
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: s_addc_u32 s8, s11, s10
; GCN-NEXT: v_mul_hi_u32 v1, s2, 24
; GCN-NEXT: v_mul_hi_u32 v0, s8, 24
@@ -1238,48 +1230,46 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_readfirstlane_b32 s10, v1
; GCN-NEXT: v_readfirstlane_b32 s9, v0
; GCN-NEXT: s_add_u32 s8, s10, s8
-; GCN-NEXT: s_addc_u32 s10, 0, s9
-; GCN-NEXT: v_mov_b32_e32 v0, s10
+; GCN-NEXT: s_addc_u32 s12, 0, s9
+; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
-; GCN-NEXT: s_mul_i32 s8, s7, s10
+; GCN-NEXT: s_mul_i32 s8, s7, s12
; GCN-NEXT: v_readfirstlane_b32 s9, v0
-; GCN-NEXT: s_add_i32 s11, s9, s8
-; GCN-NEXT: s_sub_i32 s12, 0, s11
-; GCN-NEXT: s_mul_i32 s8, s6, s10
-; GCN-NEXT: s_sub_u32 s13, 24, s8
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s14, s8, s9
-; GCN-NEXT: s_cmp_lg_u32 s14, 0
-; GCN-NEXT: s_subb_u32 s12, s12, s7
-; GCN-NEXT: s_sub_u32 s15, s13, s6
+; GCN-NEXT: s_add_i32 s13, s9, s8
+; GCN-NEXT: s_sub_i32 s10, 0, s13
+; GCN-NEXT: s_mul_i32 s8, s6, s12
+; GCN-NEXT: s_sub_u32 s14, 24, s8
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: s_or_b32 s11, s8, s9
+; GCN-NEXT: s_subb_u32 s15, s10, s7
+; GCN-NEXT: s_sub_u32 s16, s14, s6
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_or_b32 s10, s10, s11
+; GCN-NEXT: s_subb_u32 s10, s15, 0
+; GCN-NEXT: s_cmp_ge_u32 s10, s7
+; GCN-NEXT: s_cselect_b32 s11, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s16, s6
+; GCN-NEXT: s_cselect_b32 s15, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s10, s7
+; GCN-NEXT: s_cselect_b32 s10, s15, s11
+; GCN-NEXT: s_add_u32 s11, s12, 1
+; GCN-NEXT: s_addc_u32 s15, 0, 0
+; GCN-NEXT: s_add_u32 s16, s12, 2
+; GCN-NEXT: s_addc_u32 s17, 0, 0
+; GCN-NEXT: s_cmp_lg_u32 s10, 0
+; GCN-NEXT: s_cselect_b32 s10, s16, s11
+; GCN-NEXT: s_cselect_b32 s11, s17, s15
; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
-; GCN-NEXT: s_subb_u32 s8, s12, 0
+; GCN-NEXT: s_subb_u32 s8, 0, s13
; GCN-NEXT: s_cmp_ge_u32 s8, s7
; GCN-NEXT: s_cselect_b32 s9, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s15, s6
-; GCN-NEXT: s_cselect_b32 s12, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s8, s7
-; GCN-NEXT: s_cselect_b32 s8, s12, s9
-; GCN-NEXT: s_add_u32 s9, s10, 1
-; GCN-NEXT: s_addc_u32 s12, 0, 0
-; GCN-NEXT: s_add_u32 s15, s10, 2
-; GCN-NEXT: s_addc_u32 s16, 0, 0
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
-; GCN-NEXT: s_cselect_b32 s8, s15, s9
-; GCN-NEXT: s_cselect_b32 s9, s16, s12
-; GCN-NEXT: s_cmp_lg_u32 s14, 0
-; GCN-NEXT: s_subb_u32 s11, 0, s11
-; GCN-NEXT: s_cmp_ge_u32 s11, s7
-; GCN-NEXT: s_cselect_b32 s12, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s13, s6
+; GCN-NEXT: s_cmp_ge_u32 s14, s6
; GCN-NEXT: s_cselect_b32 s6, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s11, s7
-; GCN-NEXT: s_cselect_b32 s6, s6, s12
+; GCN-NEXT: s_cmp_eq_u32 s8, s7
+; GCN-NEXT: s_cselect_b32 s6, s6, s9
; GCN-NEXT: s_cmp_lg_u32 s6, 0
-; GCN-NEXT: s_cselect_b32 s7, s9, 0
-; GCN-NEXT: s_cselect_b32 s6, s8, s10
+; GCN-NEXT: s_cselect_b32 s7, s11, 0
+; GCN-NEXT: s_cselect_b32 s6, s10, s12
; GCN-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_subb_u32 s7, s7, s4
@@ -1315,7 +1305,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s12, s10, 1
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_or_b32 s8, s8, s9
-; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
; GCN-IR-NEXT: s_addc_u32 s8, s11, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s10, 63, s10
@@ -1348,7 +1337,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s16, s16, 1
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_or_b32 s18, s18, s19
-; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0
; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll
index bbd1793..e12e31b 100644
--- a/llvm/test/CodeGen/AMDGPU/srem.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem.ll
@@ -1513,7 +1513,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9
; GCN-NEXT: s_sub_u32 s3, 0, s8
-; GCN-NEXT: s_subb_u32 s12, 0, s9
+; GCN-NEXT: s_subb_u32 s10, 0, s9
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1522,56 +1522,52 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s13, v1
-; GCN-NEXT: v_readfirstlane_b32 s10, v0
-; GCN-NEXT: s_mul_i32 s11, s3, s13
-; GCN-NEXT: s_mul_hi_u32 s15, s3, s10
-; GCN-NEXT: s_mul_i32 s14, s12, s10
-; GCN-NEXT: s_add_i32 s11, s15, s11
-; GCN-NEXT: s_add_i32 s11, s11, s14
-; GCN-NEXT: s_mul_i32 s16, s3, s10
-; GCN-NEXT: s_mul_i32 s15, s10, s11
-; GCN-NEXT: s_mul_hi_u32 s17, s10, s16
-; GCN-NEXT: s_mul_hi_u32 s14, s10, s11
+; GCN-NEXT: v_readfirstlane_b32 s11, v1
+; GCN-NEXT: v_readfirstlane_b32 s12, v0
+; GCN-NEXT: s_mul_i32 s13, s3, s11
+; GCN-NEXT: s_mul_hi_u32 s15, s3, s12
+; GCN-NEXT: s_mul_i32 s14, s10, s12
+; GCN-NEXT: s_add_i32 s13, s15, s13
+; GCN-NEXT: s_add_i32 s13, s13, s14
+; GCN-NEXT: s_mul_i32 s16, s3, s12
+; GCN-NEXT: s_mul_i32 s15, s12, s13
+; GCN-NEXT: s_mul_hi_u32 s17, s12, s16
+; GCN-NEXT: s_mul_hi_u32 s14, s12, s13
; GCN-NEXT: s_add_u32 s15, s17, s15
; GCN-NEXT: s_addc_u32 s14, 0, s14
-; GCN-NEXT: s_mul_hi_u32 s18, s13, s16
-; GCN-NEXT: s_mul_i32 s16, s13, s16
+; GCN-NEXT: s_mul_hi_u32 s18, s11, s16
+; GCN-NEXT: s_mul_i32 s16, s11, s16
; GCN-NEXT: s_add_u32 s15, s15, s16
-; GCN-NEXT: s_mul_hi_u32 s17, s13, s11
+; GCN-NEXT: s_mul_hi_u32 s17, s11, s13
; GCN-NEXT: s_addc_u32 s14, s14, s18
; GCN-NEXT: s_addc_u32 s15, s17, 0
-; GCN-NEXT: s_mul_i32 s11, s13, s11
-; GCN-NEXT: s_add_u32 s11, s14, s11
+; GCN-NEXT: s_mul_i32 s13, s11, s13
+; GCN-NEXT: s_add_u32 s13, s14, s13
; GCN-NEXT: s_addc_u32 s14, 0, s15
-; GCN-NEXT: s_add_u32 s15, s10, s11
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GCN-NEXT: s_addc_u32 s13, s13, s14
-; GCN-NEXT: s_mul_i32 s10, s3, s13
-; GCN-NEXT: s_mul_hi_u32 s11, s3, s15
-; GCN-NEXT: s_add_i32 s10, s11, s10
-; GCN-NEXT: s_mul_i32 s12, s12, s15
-; GCN-NEXT: s_add_i32 s10, s10, s12
-; GCN-NEXT: s_mul_i32 s3, s3, s15
-; GCN-NEXT: s_mul_hi_u32 s12, s13, s3
-; GCN-NEXT: s_mul_i32 s14, s13, s3
-; GCN-NEXT: s_mul_i32 s17, s15, s10
-; GCN-NEXT: s_mul_hi_u32 s3, s15, s3
-; GCN-NEXT: s_mul_hi_u32 s16, s15, s10
+; GCN-NEXT: s_add_u32 s12, s12, s13
+; GCN-NEXT: s_addc_u32 s11, s11, s14
+; GCN-NEXT: s_mul_i32 s13, s3, s11
+; GCN-NEXT: s_mul_hi_u32 s14, s3, s12
+; GCN-NEXT: s_add_i32 s13, s14, s13
+; GCN-NEXT: s_mul_i32 s10, s10, s12
+; GCN-NEXT: s_add_i32 s13, s13, s10
+; GCN-NEXT: s_mul_i32 s3, s3, s12
+; GCN-NEXT: s_mul_hi_u32 s14, s11, s3
+; GCN-NEXT: s_mul_i32 s15, s11, s3
+; GCN-NEXT: s_mul_i32 s17, s12, s13
+; GCN-NEXT: s_mul_hi_u32 s3, s12, s3
+; GCN-NEXT: s_mul_hi_u32 s16, s12, s13
; GCN-NEXT: s_add_u32 s3, s3, s17
; GCN-NEXT: s_addc_u32 s16, 0, s16
-; GCN-NEXT: s_add_u32 s3, s3, s14
-; GCN-NEXT: s_mul_hi_u32 s11, s13, s10
-; GCN-NEXT: s_addc_u32 s3, s16, s12
-; GCN-NEXT: s_addc_u32 s11, s11, 0
-; GCN-NEXT: s_mul_i32 s10, s13, s10
-; GCN-NEXT: s_add_u32 s3, s3, s10
-; GCN-NEXT: s_addc_u32 s12, 0, s11
-; GCN-NEXT: s_add_u32 s3, s15, s3
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0
-; GCN-NEXT: s_addc_u32 s14, s13, s12
+; GCN-NEXT: s_add_u32 s3, s3, s15
+; GCN-NEXT: s_mul_hi_u32 s10, s11, s13
+; GCN-NEXT: s_addc_u32 s3, s16, s14
+; GCN-NEXT: s_addc_u32 s10, s10, 0
+; GCN-NEXT: s_mul_i32 s13, s11, s13
+; GCN-NEXT: s_add_u32 s3, s3, s13
+; GCN-NEXT: s_addc_u32 s10, 0, s10
+; GCN-NEXT: s_add_u32 s3, s12, s3
+; GCN-NEXT: s_addc_u32 s14, s11, s10
; GCN-NEXT: s_ashr_i32 s10, s5, 31
; GCN-NEXT: s_add_u32 s12, s4, s10
; GCN-NEXT: s_mov_b32 s11, s10
@@ -1600,11 +1596,9 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: s_mul_i32 s3, s8, s3
; GCN-NEXT: s_sub_u32 s3, s12, s3
; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
; GCN-NEXT: s_subb_u32 s12, s16, s9
; GCN-NEXT: s_sub_u32 s18, s3, s8
; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s19, s12, 0
; GCN-NEXT: s_cmp_ge_u32 s19, s9
; GCN-NEXT: s_cselect_b32 s20, -1, 0
@@ -1614,12 +1608,10 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-NEXT: s_cselect_b32 s20, s21, s20
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s12, s12, s9
-; GCN-NEXT: s_sub_u32 s21, s18, s8
-; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
+; GCN-NEXT: s_sub_u32 s16, s18, s8
; GCN-NEXT: s_subb_u32 s12, s12, 0
; GCN-NEXT: s_cmp_lg_u32 s20, 0
-; GCN-NEXT: s_cselect_b32 s16, s21, s18
+; GCN-NEXT: s_cselect_b32 s16, s16, s18
; GCN-NEXT: s_cselect_b32 s12, s12, s19
; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
; GCN-NEXT: s_subb_u32 s5, s13, s5
@@ -1931,11 +1923,9 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; TONGA-NEXT: v_readfirstlane_b32 s14, v0
; TONGA-NEXT: s_sub_u32 s12, s12, s14
; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s3, s3, s7
; TONGA-NEXT: s_sub_u32 s18, s12, s6
; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s19, s3, 0
; TONGA-NEXT: s_cmp_ge_u32 s19, s7
; TONGA-NEXT: s_cselect_b32 s20, -1, 0
@@ -1945,12 +1935,10 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; TONGA-NEXT: s_cselect_b32 s20, s21, s20
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s3, s3, s7
-; TONGA-NEXT: s_sub_u32 s21, s18, s6
-; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
+; TONGA-NEXT: s_sub_u32 s16, s18, s6
; TONGA-NEXT: s_subb_u32 s3, s3, 0
; TONGA-NEXT: s_cmp_lg_u32 s20, 0
-; TONGA-NEXT: s_cselect_b32 s16, s21, s18
+; TONGA-NEXT: s_cselect_b32 s16, s16, s18
; TONGA-NEXT: s_cselect_b32 s3, s3, s19
; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s5, s13, s5
@@ -2730,7 +2718,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7
; GCN-NEXT: s_sub_u32 s9, 0, s6
-; GCN-NEXT: s_subb_u32 s16, 0, s7
+; GCN-NEXT: s_subb_u32 s14, 0, s7
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2739,56 +2727,52 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s17, v1
-; GCN-NEXT: v_readfirstlane_b32 s14, v0
-; GCN-NEXT: s_mul_i32 s15, s9, s17
-; GCN-NEXT: s_mul_hi_u32 s19, s9, s14
-; GCN-NEXT: s_mul_i32 s18, s16, s14
-; GCN-NEXT: s_add_i32 s15, s19, s15
-; GCN-NEXT: s_add_i32 s15, s15, s18
-; GCN-NEXT: s_mul_i32 s20, s9, s14
-; GCN-NEXT: s_mul_i32 s19, s14, s15
-; GCN-NEXT: s_mul_hi_u32 s21, s14, s20
-; GCN-NEXT: s_mul_hi_u32 s18, s14, s15
+; GCN-NEXT: v_readfirstlane_b32 s15, v1
+; GCN-NEXT: v_readfirstlane_b32 s16, v0
+; GCN-NEXT: s_mul_i32 s17, s9, s15
+; GCN-NEXT: s_mul_hi_u32 s19, s9, s16
+; GCN-NEXT: s_mul_i32 s18, s14, s16
+; GCN-NEXT: s_add_i32 s17, s19, s17
+; GCN-NEXT: s_add_i32 s17, s17, s18
+; GCN-NEXT: s_mul_i32 s20, s9, s16
+; GCN-NEXT: s_mul_i32 s19, s16, s17
+; GCN-NEXT: s_mul_hi_u32 s21, s16, s20
+; GCN-NEXT: s_mul_hi_u32 s18, s16, s17
; GCN-NEXT: s_add_u32 s19, s21, s19
; GCN-NEXT: s_addc_u32 s18, 0, s18
-; GCN-NEXT: s_mul_hi_u32 s22, s17, s20
-; GCN-NEXT: s_mul_i32 s20, s17, s20
+; GCN-NEXT: s_mul_hi_u32 s22, s15, s20
+; GCN-NEXT: s_mul_i32 s20, s15, s20
; GCN-NEXT: s_add_u32 s19, s19, s20
-; GCN-NEXT: s_mul_hi_u32 s21, s17, s15
+; GCN-NEXT: s_mul_hi_u32 s21, s15, s17
; GCN-NEXT: s_addc_u32 s18, s18, s22
; GCN-NEXT: s_addc_u32 s19, s21, 0
-; GCN-NEXT: s_mul_i32 s15, s17, s15
-; GCN-NEXT: s_add_u32 s15, s18, s15
+; GCN-NEXT: s_mul_i32 s17, s15, s17
+; GCN-NEXT: s_add_u32 s17, s18, s17
; GCN-NEXT: s_addc_u32 s18, 0, s19
-; GCN-NEXT: s_add_u32 s19, s14, s15
-; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GCN-NEXT: s_addc_u32 s17, s17, s18
-; GCN-NEXT: s_mul_i32 s14, s9, s17
-; GCN-NEXT: s_mul_hi_u32 s15, s9, s19
-; GCN-NEXT: s_add_i32 s14, s15, s14
-; GCN-NEXT: s_mul_i32 s16, s16, s19
-; GCN-NEXT: s_add_i32 s14, s14, s16
-; GCN-NEXT: s_mul_i32 s9, s9, s19
-; GCN-NEXT: s_mul_hi_u32 s16, s17, s9
-; GCN-NEXT: s_mul_i32 s18, s17, s9
-; GCN-NEXT: s_mul_i32 s21, s19, s14
-; GCN-NEXT: s_mul_hi_u32 s9, s19, s9
-; GCN-NEXT: s_mul_hi_u32 s20, s19, s14
+; GCN-NEXT: s_add_u32 s16, s16, s17
+; GCN-NEXT: s_addc_u32 s15, s15, s18
+; GCN-NEXT: s_mul_i32 s17, s9, s15
+; GCN-NEXT: s_mul_hi_u32 s18, s9, s16
+; GCN-NEXT: s_add_i32 s17, s18, s17
+; GCN-NEXT: s_mul_i32 s14, s14, s16
+; GCN-NEXT: s_add_i32 s17, s17, s14
+; GCN-NEXT: s_mul_i32 s9, s9, s16
+; GCN-NEXT: s_mul_hi_u32 s18, s15, s9
+; GCN-NEXT: s_mul_i32 s19, s15, s9
+; GCN-NEXT: s_mul_i32 s21, s16, s17
+; GCN-NEXT: s_mul_hi_u32 s9, s16, s9
+; GCN-NEXT: s_mul_hi_u32 s20, s16, s17
; GCN-NEXT: s_add_u32 s9, s9, s21
; GCN-NEXT: s_addc_u32 s20, 0, s20
-; GCN-NEXT: s_add_u32 s9, s9, s18
-; GCN-NEXT: s_mul_hi_u32 s15, s17, s14
-; GCN-NEXT: s_addc_u32 s9, s20, s16
-; GCN-NEXT: s_addc_u32 s15, s15, 0
-; GCN-NEXT: s_mul_i32 s14, s17, s14
-; GCN-NEXT: s_add_u32 s9, s9, s14
-; GCN-NEXT: s_addc_u32 s16, 0, s15
-; GCN-NEXT: s_add_u32 s9, s19, s9
-; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0
-; GCN-NEXT: s_addc_u32 s18, s17, s16
+; GCN-NEXT: s_add_u32 s9, s9, s19
+; GCN-NEXT: s_mul_hi_u32 s14, s15, s17
+; GCN-NEXT: s_addc_u32 s9, s20, s18
+; GCN-NEXT: s_addc_u32 s14, s14, 0
+; GCN-NEXT: s_mul_i32 s17, s15, s17
+; GCN-NEXT: s_add_u32 s9, s9, s17
+; GCN-NEXT: s_addc_u32 s14, 0, s14
+; GCN-NEXT: s_add_u32 s9, s16, s9
+; GCN-NEXT: s_addc_u32 s18, s15, s14
; GCN-NEXT: s_ashr_i32 s14, s11, 31
; GCN-NEXT: s_add_u32 s16, s10, s14
; GCN-NEXT: s_mov_b32 s15, s14
@@ -2817,11 +2801,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s9, s6, s9
; GCN-NEXT: s_sub_u32 s9, s16, s9
; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s16, s20, s7
; GCN-NEXT: s_sub_u32 s22, s9, s6
; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s23, s16, 0
; GCN-NEXT: s_cmp_ge_u32 s23, s7
; GCN-NEXT: s_cselect_b32 s24, -1, 0
@@ -2831,12 +2813,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s24, s25, s24
; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s16, s16, s7
-; GCN-NEXT: s_sub_u32 s25, s22, s6
-; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
+; GCN-NEXT: s_sub_u32 s20, s22, s6
; GCN-NEXT: s_subb_u32 s16, s16, 0
; GCN-NEXT: s_cmp_lg_u32 s24, 0
-; GCN-NEXT: s_cselect_b32 s20, s25, s22
+; GCN-NEXT: s_cselect_b32 s20, s20, s22
; GCN-NEXT: s_cselect_b32 s16, s16, s23
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s11, s17, s11
@@ -2887,7 +2867,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11
; GCN-NEXT: s_sub_u32 s3, 0, s10
-; GCN-NEXT: s_subb_u32 s14, 0, s11
+; GCN-NEXT: s_subb_u32 s12, 0, s11
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -2896,56 +2876,52 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s15, v1
-; GCN-NEXT: v_readfirstlane_b32 s12, v0
-; GCN-NEXT: s_mul_i32 s13, s3, s15
-; GCN-NEXT: s_mul_hi_u32 s17, s3, s12
-; GCN-NEXT: s_mul_i32 s16, s14, s12
-; GCN-NEXT: s_add_i32 s13, s17, s13
-; GCN-NEXT: s_add_i32 s13, s13, s16
-; GCN-NEXT: s_mul_i32 s18, s3, s12
-; GCN-NEXT: s_mul_i32 s17, s12, s13
-; GCN-NEXT: s_mul_hi_u32 s19, s12, s18
-; GCN-NEXT: s_mul_hi_u32 s16, s12, s13
+; GCN-NEXT: v_readfirstlane_b32 s13, v1
+; GCN-NEXT: v_readfirstlane_b32 s14, v0
+; GCN-NEXT: s_mul_i32 s15, s3, s13
+; GCN-NEXT: s_mul_hi_u32 s17, s3, s14
+; GCN-NEXT: s_mul_i32 s16, s12, s14
+; GCN-NEXT: s_add_i32 s15, s17, s15
+; GCN-NEXT: s_add_i32 s15, s15, s16
+; GCN-NEXT: s_mul_i32 s18, s3, s14
+; GCN-NEXT: s_mul_i32 s17, s14, s15
+; GCN-NEXT: s_mul_hi_u32 s19, s14, s18
+; GCN-NEXT: s_mul_hi_u32 s16, s14, s15
; GCN-NEXT: s_add_u32 s17, s19, s17
; GCN-NEXT: s_addc_u32 s16, 0, s16
-; GCN-NEXT: s_mul_hi_u32 s20, s15, s18
-; GCN-NEXT: s_mul_i32 s18, s15, s18
+; GCN-NEXT: s_mul_hi_u32 s20, s13, s18
+; GCN-NEXT: s_mul_i32 s18, s13, s18
; GCN-NEXT: s_add_u32 s17, s17, s18
-; GCN-NEXT: s_mul_hi_u32 s19, s15, s13
+; GCN-NEXT: s_mul_hi_u32 s19, s13, s15
; GCN-NEXT: s_addc_u32 s16, s16, s20
; GCN-NEXT: s_addc_u32 s17, s19, 0
-; GCN-NEXT: s_mul_i32 s13, s15, s13
-; GCN-NEXT: s_add_u32 s13, s16, s13
+; GCN-NEXT: s_mul_i32 s15, s13, s15
+; GCN-NEXT: s_add_u32 s15, s16, s15
; GCN-NEXT: s_addc_u32 s16, 0, s17
-; GCN-NEXT: s_add_u32 s17, s12, s13
-; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GCN-NEXT: s_addc_u32 s15, s15, s16
-; GCN-NEXT: s_mul_i32 s12, s3, s15
-; GCN-NEXT: s_mul_hi_u32 s13, s3, s17
-; GCN-NEXT: s_add_i32 s12, s13, s12
-; GCN-NEXT: s_mul_i32 s14, s14, s17
-; GCN-NEXT: s_add_i32 s12, s12, s14
-; GCN-NEXT: s_mul_i32 s3, s3, s17
-; GCN-NEXT: s_mul_hi_u32 s14, s15, s3
-; GCN-NEXT: s_mul_i32 s16, s15, s3
-; GCN-NEXT: s_mul_i32 s19, s17, s12
-; GCN-NEXT: s_mul_hi_u32 s3, s17, s3
-; GCN-NEXT: s_mul_hi_u32 s18, s17, s12
+; GCN-NEXT: s_add_u32 s14, s14, s15
+; GCN-NEXT: s_addc_u32 s13, s13, s16
+; GCN-NEXT: s_mul_i32 s15, s3, s13
+; GCN-NEXT: s_mul_hi_u32 s16, s3, s14
+; GCN-NEXT: s_add_i32 s15, s16, s15
+; GCN-NEXT: s_mul_i32 s12, s12, s14
+; GCN-NEXT: s_add_i32 s15, s15, s12
+; GCN-NEXT: s_mul_i32 s3, s3, s14
+; GCN-NEXT: s_mul_hi_u32 s16, s13, s3
+; GCN-NEXT: s_mul_i32 s17, s13, s3
+; GCN-NEXT: s_mul_i32 s19, s14, s15
+; GCN-NEXT: s_mul_hi_u32 s3, s14, s3
+; GCN-NEXT: s_mul_hi_u32 s18, s14, s15
; GCN-NEXT: s_add_u32 s3, s3, s19
; GCN-NEXT: s_addc_u32 s18, 0, s18
-; GCN-NEXT: s_add_u32 s3, s3, s16
-; GCN-NEXT: s_mul_hi_u32 s13, s15, s12
-; GCN-NEXT: s_addc_u32 s3, s18, s14
-; GCN-NEXT: s_addc_u32 s13, s13, 0
-; GCN-NEXT: s_mul_i32 s12, s15, s12
-; GCN-NEXT: s_add_u32 s3, s3, s12
-; GCN-NEXT: s_addc_u32 s14, 0, s13
-; GCN-NEXT: s_add_u32 s3, s17, s3
-; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GCN-NEXT: s_addc_u32 s16, s15, s14
+; GCN-NEXT: s_add_u32 s3, s3, s17
+; GCN-NEXT: s_mul_hi_u32 s12, s13, s15
+; GCN-NEXT: s_addc_u32 s3, s18, s16
+; GCN-NEXT: s_addc_u32 s12, s12, 0
+; GCN-NEXT: s_mul_i32 s15, s13, s15
+; GCN-NEXT: s_add_u32 s3, s3, s15
+; GCN-NEXT: s_addc_u32 s12, 0, s12
+; GCN-NEXT: s_add_u32 s3, s14, s3
+; GCN-NEXT: s_addc_u32 s16, s13, s12
; GCN-NEXT: s_ashr_i32 s12, s5, 31
; GCN-NEXT: s_add_u32 s14, s4, s12
; GCN-NEXT: s_mov_b32 s13, s12
@@ -2974,11 +2950,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s3, s10, s3
; GCN-NEXT: s_sub_u32 s3, s14, s3
; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s14, s18, s11
; GCN-NEXT: s_sub_u32 s20, s3, s10
; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s21, s14, 0
; GCN-NEXT: s_cmp_ge_u32 s21, s11
; GCN-NEXT: s_cselect_b32 s22, -1, 0
@@ -2988,12 +2962,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s22, s23, s22
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s14, s14, s11
-; GCN-NEXT: s_sub_u32 s23, s20, s10
-; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
+; GCN-NEXT: s_sub_u32 s18, s20, s10
; GCN-NEXT: s_subb_u32 s14, s14, 0
; GCN-NEXT: s_cmp_lg_u32 s22, 0
-; GCN-NEXT: s_cselect_b32 s18, s23, s20
+; GCN-NEXT: s_cselect_b32 s18, s18, s20
; GCN-NEXT: s_cselect_b32 s14, s14, s21
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s5, s15, s5
@@ -3463,11 +3435,9 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_readfirstlane_b32 s14, v0
; TONGA-NEXT: s_sub_u32 s12, s12, s14
; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s1, s1, s7
; TONGA-NEXT: s_sub_u32 s18, s12, s6
; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s19, s1, 0
; TONGA-NEXT: s_cmp_ge_u32 s19, s7
; TONGA-NEXT: s_cselect_b32 s20, -1, 0
@@ -3477,12 +3447,10 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: s_cselect_b32 s20, s21, s20
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s1, s1, s7
-; TONGA-NEXT: s_sub_u32 s21, s18, s6
-; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
+; TONGA-NEXT: s_sub_u32 s16, s18, s6
; TONGA-NEXT: s_subb_u32 s1, s1, 0
; TONGA-NEXT: s_cmp_lg_u32 s20, 0
-; TONGA-NEXT: s_cselect_b32 s16, s21, s18
+; TONGA-NEXT: s_cselect_b32 s16, s16, s18
; TONGA-NEXT: s_cselect_b32 s1, s1, s19
; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s3, s13, s3
@@ -4934,7 +4902,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7
; GCN-NEXT: s_sub_u32 s17, 0, s6
-; GCN-NEXT: s_subb_u32 s24, 0, s7
+; GCN-NEXT: s_subb_u32 s22, 0, s7
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -4943,56 +4911,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s25, v1
-; GCN-NEXT: v_readfirstlane_b32 s22, v0
-; GCN-NEXT: s_mul_i32 s23, s17, s25
-; GCN-NEXT: s_mul_hi_u32 s27, s17, s22
-; GCN-NEXT: s_mul_i32 s26, s24, s22
-; GCN-NEXT: s_add_i32 s23, s27, s23
-; GCN-NEXT: s_add_i32 s23, s23, s26
-; GCN-NEXT: s_mul_i32 s28, s17, s22
-; GCN-NEXT: s_mul_i32 s27, s22, s23
-; GCN-NEXT: s_mul_hi_u32 s29, s22, s28
-; GCN-NEXT: s_mul_hi_u32 s26, s22, s23
+; GCN-NEXT: v_readfirstlane_b32 s23, v1
+; GCN-NEXT: v_readfirstlane_b32 s24, v0
+; GCN-NEXT: s_mul_i32 s25, s17, s23
+; GCN-NEXT: s_mul_hi_u32 s27, s17, s24
+; GCN-NEXT: s_mul_i32 s26, s22, s24
+; GCN-NEXT: s_add_i32 s25, s27, s25
+; GCN-NEXT: s_add_i32 s25, s25, s26
+; GCN-NEXT: s_mul_i32 s28, s17, s24
+; GCN-NEXT: s_mul_i32 s27, s24, s25
+; GCN-NEXT: s_mul_hi_u32 s29, s24, s28
+; GCN-NEXT: s_mul_hi_u32 s26, s24, s25
; GCN-NEXT: s_add_u32 s27, s29, s27
; GCN-NEXT: s_addc_u32 s26, 0, s26
-; GCN-NEXT: s_mul_hi_u32 s30, s25, s28
-; GCN-NEXT: s_mul_i32 s28, s25, s28
+; GCN-NEXT: s_mul_hi_u32 s30, s23, s28
+; GCN-NEXT: s_mul_i32 s28, s23, s28
; GCN-NEXT: s_add_u32 s27, s27, s28
-; GCN-NEXT: s_mul_hi_u32 s29, s25, s23
+; GCN-NEXT: s_mul_hi_u32 s29, s23, s25
; GCN-NEXT: s_addc_u32 s26, s26, s30
; GCN-NEXT: s_addc_u32 s27, s29, 0
-; GCN-NEXT: s_mul_i32 s23, s25, s23
-; GCN-NEXT: s_add_u32 s23, s26, s23
+; GCN-NEXT: s_mul_i32 s25, s23, s25
+; GCN-NEXT: s_add_u32 s25, s26, s25
; GCN-NEXT: s_addc_u32 s26, 0, s27
-; GCN-NEXT: s_add_u32 s27, s22, s23
-; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
-; GCN-NEXT: s_addc_u32 s25, s25, s26
-; GCN-NEXT: s_mul_i32 s22, s17, s25
-; GCN-NEXT: s_mul_hi_u32 s23, s17, s27
-; GCN-NEXT: s_add_i32 s22, s23, s22
-; GCN-NEXT: s_mul_i32 s24, s24, s27
-; GCN-NEXT: s_add_i32 s22, s22, s24
-; GCN-NEXT: s_mul_i32 s17, s17, s27
-; GCN-NEXT: s_mul_hi_u32 s24, s25, s17
-; GCN-NEXT: s_mul_i32 s26, s25, s17
-; GCN-NEXT: s_mul_i32 s29, s27, s22
-; GCN-NEXT: s_mul_hi_u32 s17, s27, s17
-; GCN-NEXT: s_mul_hi_u32 s28, s27, s22
+; GCN-NEXT: s_add_u32 s24, s24, s25
+; GCN-NEXT: s_addc_u32 s23, s23, s26
+; GCN-NEXT: s_mul_i32 s25, s17, s23
+; GCN-NEXT: s_mul_hi_u32 s26, s17, s24
+; GCN-NEXT: s_add_i32 s25, s26, s25
+; GCN-NEXT: s_mul_i32 s22, s22, s24
+; GCN-NEXT: s_add_i32 s25, s25, s22
+; GCN-NEXT: s_mul_i32 s17, s17, s24
+; GCN-NEXT: s_mul_hi_u32 s26, s23, s17
+; GCN-NEXT: s_mul_i32 s27, s23, s17
+; GCN-NEXT: s_mul_i32 s29, s24, s25
+; GCN-NEXT: s_mul_hi_u32 s17, s24, s17
+; GCN-NEXT: s_mul_hi_u32 s28, s24, s25
; GCN-NEXT: s_add_u32 s17, s17, s29
; GCN-NEXT: s_addc_u32 s28, 0, s28
-; GCN-NEXT: s_add_u32 s17, s17, s26
-; GCN-NEXT: s_mul_hi_u32 s23, s25, s22
-; GCN-NEXT: s_addc_u32 s17, s28, s24
-; GCN-NEXT: s_addc_u32 s23, s23, 0
-; GCN-NEXT: s_mul_i32 s22, s25, s22
-; GCN-NEXT: s_add_u32 s17, s17, s22
-; GCN-NEXT: s_addc_u32 s24, 0, s23
-; GCN-NEXT: s_add_u32 s17, s27, s17
-; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
-; GCN-NEXT: s_addc_u32 s26, s25, s24
+; GCN-NEXT: s_add_u32 s17, s17, s27
+; GCN-NEXT: s_mul_hi_u32 s22, s23, s25
+; GCN-NEXT: s_addc_u32 s17, s28, s26
+; GCN-NEXT: s_addc_u32 s22, s22, 0
+; GCN-NEXT: s_mul_i32 s25, s23, s25
+; GCN-NEXT: s_add_u32 s17, s17, s25
+; GCN-NEXT: s_addc_u32 s22, 0, s22
+; GCN-NEXT: s_add_u32 s17, s24, s17
+; GCN-NEXT: s_addc_u32 s26, s23, s22
; GCN-NEXT: s_ashr_i32 s22, s19, 31
; GCN-NEXT: s_add_u32 s24, s18, s22
; GCN-NEXT: s_mov_b32 s23, s22
@@ -5021,11 +4985,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s17, s6, s17
; GCN-NEXT: s_sub_u32 s17, s24, s17
; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s24, s28, s7
; GCN-NEXT: s_sub_u32 s30, s17, s6
; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0
; GCN-NEXT: s_subb_u32 s31, s24, 0
; GCN-NEXT: s_cmp_ge_u32 s31, s7
; GCN-NEXT: s_cselect_b32 s33, -1, 0
@@ -5035,12 +4997,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s33, s34, s33
; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0
; GCN-NEXT: s_subb_u32 s24, s24, s7
-; GCN-NEXT: s_sub_u32 s34, s30, s6
-; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0
+; GCN-NEXT: s_sub_u32 s28, s30, s6
; GCN-NEXT: s_subb_u32 s24, s24, 0
; GCN-NEXT: s_cmp_lg_u32 s33, 0
-; GCN-NEXT: s_cselect_b32 s28, s34, s30
+; GCN-NEXT: s_cselect_b32 s28, s28, s30
; GCN-NEXT: s_cselect_b32 s24, s24, s31
; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s19, s25, s19
@@ -5091,7 +5051,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s18
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s19
; GCN-NEXT: s_sub_u32 s13, 0, s18
-; GCN-NEXT: s_subb_u32 s22, 0, s19
+; GCN-NEXT: s_subb_u32 s20, 0, s19
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -5100,56 +5060,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s23, v1
-; GCN-NEXT: v_readfirstlane_b32 s20, v0
-; GCN-NEXT: s_mul_i32 s21, s13, s23
-; GCN-NEXT: s_mul_hi_u32 s25, s13, s20
-; GCN-NEXT: s_mul_i32 s24, s22, s20
-; GCN-NEXT: s_add_i32 s21, s25, s21
-; GCN-NEXT: s_add_i32 s21, s21, s24
-; GCN-NEXT: s_mul_i32 s26, s13, s20
-; GCN-NEXT: s_mul_i32 s25, s20, s21
-; GCN-NEXT: s_mul_hi_u32 s27, s20, s26
-; GCN-NEXT: s_mul_hi_u32 s24, s20, s21
+; GCN-NEXT: v_readfirstlane_b32 s21, v1
+; GCN-NEXT: v_readfirstlane_b32 s22, v0
+; GCN-NEXT: s_mul_i32 s23, s13, s21
+; GCN-NEXT: s_mul_hi_u32 s25, s13, s22
+; GCN-NEXT: s_mul_i32 s24, s20, s22
+; GCN-NEXT: s_add_i32 s23, s25, s23
+; GCN-NEXT: s_add_i32 s23, s23, s24
+; GCN-NEXT: s_mul_i32 s26, s13, s22
+; GCN-NEXT: s_mul_i32 s25, s22, s23
+; GCN-NEXT: s_mul_hi_u32 s27, s22, s26
+; GCN-NEXT: s_mul_hi_u32 s24, s22, s23
; GCN-NEXT: s_add_u32 s25, s27, s25
; GCN-NEXT: s_addc_u32 s24, 0, s24
-; GCN-NEXT: s_mul_hi_u32 s28, s23, s26
-; GCN-NEXT: s_mul_i32 s26, s23, s26
+; GCN-NEXT: s_mul_hi_u32 s28, s21, s26
+; GCN-NEXT: s_mul_i32 s26, s21, s26
; GCN-NEXT: s_add_u32 s25, s25, s26
-; GCN-NEXT: s_mul_hi_u32 s27, s23, s21
+; GCN-NEXT: s_mul_hi_u32 s27, s21, s23
; GCN-NEXT: s_addc_u32 s24, s24, s28
; GCN-NEXT: s_addc_u32 s25, s27, 0
-; GCN-NEXT: s_mul_i32 s21, s23, s21
-; GCN-NEXT: s_add_u32 s21, s24, s21
+; GCN-NEXT: s_mul_i32 s23, s21, s23
+; GCN-NEXT: s_add_u32 s23, s24, s23
; GCN-NEXT: s_addc_u32 s24, 0, s25
-; GCN-NEXT: s_add_u32 s25, s20, s21
-; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
-; GCN-NEXT: s_addc_u32 s23, s23, s24
-; GCN-NEXT: s_mul_i32 s20, s13, s23
-; GCN-NEXT: s_mul_hi_u32 s21, s13, s25
-; GCN-NEXT: s_add_i32 s20, s21, s20
-; GCN-NEXT: s_mul_i32 s22, s22, s25
-; GCN-NEXT: s_add_i32 s20, s20, s22
-; GCN-NEXT: s_mul_i32 s13, s13, s25
-; GCN-NEXT: s_mul_hi_u32 s22, s23, s13
-; GCN-NEXT: s_mul_i32 s24, s23, s13
-; GCN-NEXT: s_mul_i32 s27, s25, s20
-; GCN-NEXT: s_mul_hi_u32 s13, s25, s13
-; GCN-NEXT: s_mul_hi_u32 s26, s25, s20
+; GCN-NEXT: s_add_u32 s22, s22, s23
+; GCN-NEXT: s_addc_u32 s21, s21, s24
+; GCN-NEXT: s_mul_i32 s23, s13, s21
+; GCN-NEXT: s_mul_hi_u32 s24, s13, s22
+; GCN-NEXT: s_add_i32 s23, s24, s23
+; GCN-NEXT: s_mul_i32 s20, s20, s22
+; GCN-NEXT: s_add_i32 s23, s23, s20
+; GCN-NEXT: s_mul_i32 s13, s13, s22
+; GCN-NEXT: s_mul_hi_u32 s24, s21, s13
+; GCN-NEXT: s_mul_i32 s25, s21, s13
+; GCN-NEXT: s_mul_i32 s27, s22, s23
+; GCN-NEXT: s_mul_hi_u32 s13, s22, s13
+; GCN-NEXT: s_mul_hi_u32 s26, s22, s23
; GCN-NEXT: s_add_u32 s13, s13, s27
; GCN-NEXT: s_addc_u32 s26, 0, s26
-; GCN-NEXT: s_add_u32 s13, s13, s24
-; GCN-NEXT: s_mul_hi_u32 s21, s23, s20
-; GCN-NEXT: s_addc_u32 s13, s26, s22
-; GCN-NEXT: s_addc_u32 s21, s21, 0
-; GCN-NEXT: s_mul_i32 s20, s23, s20
-; GCN-NEXT: s_add_u32 s13, s13, s20
-; GCN-NEXT: s_addc_u32 s22, 0, s21
-; GCN-NEXT: s_add_u32 s13, s25, s13
-; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
-; GCN-NEXT: s_addc_u32 s24, s23, s22
+; GCN-NEXT: s_add_u32 s13, s13, s25
+; GCN-NEXT: s_mul_hi_u32 s20, s21, s23
+; GCN-NEXT: s_addc_u32 s13, s26, s24
+; GCN-NEXT: s_addc_u32 s20, s20, 0
+; GCN-NEXT: s_mul_i32 s23, s21, s23
+; GCN-NEXT: s_add_u32 s13, s13, s23
+; GCN-NEXT: s_addc_u32 s20, 0, s20
+; GCN-NEXT: s_add_u32 s13, s22, s13
+; GCN-NEXT: s_addc_u32 s24, s21, s20
; GCN-NEXT: s_ashr_i32 s20, s15, 31
; GCN-NEXT: s_add_u32 s22, s14, s20
; GCN-NEXT: s_mov_b32 s21, s20
@@ -5178,11 +5134,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s13, s18, s13
; GCN-NEXT: s_sub_u32 s13, s22, s13
; GCN-NEXT: s_cselect_b64 s[24:25], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0
; GCN-NEXT: s_subb_u32 s22, s26, s19
; GCN-NEXT: s_sub_u32 s28, s13, s18
; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s29, s22, 0
; GCN-NEXT: s_cmp_ge_u32 s29, s19
; GCN-NEXT: s_cselect_b32 s30, -1, 0
@@ -5192,12 +5146,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s30, s31, s30
; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
; GCN-NEXT: s_subb_u32 s22, s22, s19
-; GCN-NEXT: s_sub_u32 s31, s28, s18
-; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0
+; GCN-NEXT: s_sub_u32 s26, s28, s18
; GCN-NEXT: s_subb_u32 s22, s22, 0
; GCN-NEXT: s_cmp_lg_u32 s30, 0
-; GCN-NEXT: s_cselect_b32 s26, s31, s28
+; GCN-NEXT: s_cselect_b32 s26, s26, s28
; GCN-NEXT: s_cselect_b32 s22, s22, s29
; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0
; GCN-NEXT: s_subb_u32 s15, s23, s15
@@ -5257,7 +5209,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s14
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s15
; GCN-NEXT: s_sub_u32 s9, 0, s14
-; GCN-NEXT: s_subb_u32 s18, 0, s15
+; GCN-NEXT: s_subb_u32 s16, 0, s15
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -5266,56 +5218,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s19, v1
-; GCN-NEXT: v_readfirstlane_b32 s16, v0
-; GCN-NEXT: s_mul_i32 s17, s9, s19
-; GCN-NEXT: s_mul_hi_u32 s21, s9, s16
-; GCN-NEXT: s_mul_i32 s20, s18, s16
-; GCN-NEXT: s_add_i32 s17, s21, s17
-; GCN-NEXT: s_add_i32 s17, s17, s20
-; GCN-NEXT: s_mul_i32 s22, s9, s16
-; GCN-NEXT: s_mul_i32 s21, s16, s17
-; GCN-NEXT: s_mul_hi_u32 s23, s16, s22
-; GCN-NEXT: s_mul_hi_u32 s20, s16, s17
+; GCN-NEXT: v_readfirstlane_b32 s17, v1
+; GCN-NEXT: v_readfirstlane_b32 s18, v0
+; GCN-NEXT: s_mul_i32 s19, s9, s17
+; GCN-NEXT: s_mul_hi_u32 s21, s9, s18
+; GCN-NEXT: s_mul_i32 s20, s16, s18
+; GCN-NEXT: s_add_i32 s19, s21, s19
+; GCN-NEXT: s_add_i32 s19, s19, s20
+; GCN-NEXT: s_mul_i32 s22, s9, s18
+; GCN-NEXT: s_mul_i32 s21, s18, s19
+; GCN-NEXT: s_mul_hi_u32 s23, s18, s22
+; GCN-NEXT: s_mul_hi_u32 s20, s18, s19
; GCN-NEXT: s_add_u32 s21, s23, s21
; GCN-NEXT: s_addc_u32 s20, 0, s20
-; GCN-NEXT: s_mul_hi_u32 s24, s19, s22
-; GCN-NEXT: s_mul_i32 s22, s19, s22
+; GCN-NEXT: s_mul_hi_u32 s24, s17, s22
+; GCN-NEXT: s_mul_i32 s22, s17, s22
; GCN-NEXT: s_add_u32 s21, s21, s22
-; GCN-NEXT: s_mul_hi_u32 s23, s19, s17
+; GCN-NEXT: s_mul_hi_u32 s23, s17, s19
; GCN-NEXT: s_addc_u32 s20, s20, s24
; GCN-NEXT: s_addc_u32 s21, s23, 0
-; GCN-NEXT: s_mul_i32 s17, s19, s17
-; GCN-NEXT: s_add_u32 s17, s20, s17
+; GCN-NEXT: s_mul_i32 s19, s17, s19
+; GCN-NEXT: s_add_u32 s19, s20, s19
; GCN-NEXT: s_addc_u32 s20, 0, s21
-; GCN-NEXT: s_add_u32 s21, s16, s17
-; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
-; GCN-NEXT: s_addc_u32 s19, s19, s20
-; GCN-NEXT: s_mul_i32 s16, s9, s19
-; GCN-NEXT: s_mul_hi_u32 s17, s9, s21
-; GCN-NEXT: s_add_i32 s16, s17, s16
-; GCN-NEXT: s_mul_i32 s18, s18, s21
-; GCN-NEXT: s_add_i32 s16, s16, s18
-; GCN-NEXT: s_mul_i32 s9, s9, s21
-; GCN-NEXT: s_mul_hi_u32 s18, s19, s9
-; GCN-NEXT: s_mul_i32 s20, s19, s9
-; GCN-NEXT: s_mul_i32 s23, s21, s16
-; GCN-NEXT: s_mul_hi_u32 s9, s21, s9
-; GCN-NEXT: s_mul_hi_u32 s22, s21, s16
+; GCN-NEXT: s_add_u32 s18, s18, s19
+; GCN-NEXT: s_addc_u32 s17, s17, s20
+; GCN-NEXT: s_mul_i32 s19, s9, s17
+; GCN-NEXT: s_mul_hi_u32 s20, s9, s18
+; GCN-NEXT: s_add_i32 s19, s20, s19
+; GCN-NEXT: s_mul_i32 s16, s16, s18
+; GCN-NEXT: s_add_i32 s19, s19, s16
+; GCN-NEXT: s_mul_i32 s9, s9, s18
+; GCN-NEXT: s_mul_hi_u32 s20, s17, s9
+; GCN-NEXT: s_mul_i32 s21, s17, s9
+; GCN-NEXT: s_mul_i32 s23, s18, s19
+; GCN-NEXT: s_mul_hi_u32 s9, s18, s9
+; GCN-NEXT: s_mul_hi_u32 s22, s18, s19
; GCN-NEXT: s_add_u32 s9, s9, s23
; GCN-NEXT: s_addc_u32 s22, 0, s22
-; GCN-NEXT: s_add_u32 s9, s9, s20
-; GCN-NEXT: s_mul_hi_u32 s17, s19, s16
-; GCN-NEXT: s_addc_u32 s9, s22, s18
-; GCN-NEXT: s_addc_u32 s17, s17, 0
-; GCN-NEXT: s_mul_i32 s16, s19, s16
-; GCN-NEXT: s_add_u32 s9, s9, s16
-; GCN-NEXT: s_addc_u32 s18, 0, s17
-; GCN-NEXT: s_add_u32 s9, s21, s9
-; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
-; GCN-NEXT: s_addc_u32 s20, s19, s18
+; GCN-NEXT: s_add_u32 s9, s9, s21
+; GCN-NEXT: s_mul_hi_u32 s16, s17, s19
+; GCN-NEXT: s_addc_u32 s9, s22, s20
+; GCN-NEXT: s_addc_u32 s16, s16, 0
+; GCN-NEXT: s_mul_i32 s19, s17, s19
+; GCN-NEXT: s_add_u32 s9, s9, s19
+; GCN-NEXT: s_addc_u32 s16, 0, s16
+; GCN-NEXT: s_add_u32 s9, s18, s9
+; GCN-NEXT: s_addc_u32 s20, s17, s16
; GCN-NEXT: s_ashr_i32 s16, s11, 31
; GCN-NEXT: s_add_u32 s18, s10, s16
; GCN-NEXT: s_mov_b32 s17, s16
@@ -5344,11 +5292,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s9, s14, s9
; GCN-NEXT: s_sub_u32 s9, s18, s9
; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s18, s22, s15
; GCN-NEXT: s_sub_u32 s24, s9, s14
; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
; GCN-NEXT: s_subb_u32 s25, s18, 0
; GCN-NEXT: s_cmp_ge_u32 s25, s15
; GCN-NEXT: s_cselect_b32 s26, -1, 0
@@ -5358,12 +5304,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s26, s27, s26
; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
; GCN-NEXT: s_subb_u32 s18, s18, s15
-; GCN-NEXT: s_sub_u32 s27, s24, s14
-; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0
+; GCN-NEXT: s_sub_u32 s22, s24, s14
; GCN-NEXT: s_subb_u32 s18, s18, 0
; GCN-NEXT: s_cmp_lg_u32 s26, 0
-; GCN-NEXT: s_cselect_b32 s22, s27, s24
+; GCN-NEXT: s_cselect_b32 s22, s22, s24
; GCN-NEXT: s_cselect_b32 s18, s18, s25
; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0
; GCN-NEXT: s_subb_u32 s11, s19, s11
@@ -5420,7 +5364,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11
; GCN-NEXT: s_sub_u32 s3, 0, s10
-; GCN-NEXT: s_subb_u32 s14, 0, s11
+; GCN-NEXT: s_subb_u32 s12, 0, s11
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -5429,56 +5373,52 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_readfirstlane_b32 s15, v1
-; GCN-NEXT: v_readfirstlane_b32 s12, v0
-; GCN-NEXT: s_mul_i32 s13, s3, s15
-; GCN-NEXT: s_mul_hi_u32 s17, s3, s12
-; GCN-NEXT: s_mul_i32 s16, s14, s12
-; GCN-NEXT: s_add_i32 s13, s17, s13
-; GCN-NEXT: s_add_i32 s13, s13, s16
-; GCN-NEXT: s_mul_i32 s18, s3, s12
-; GCN-NEXT: s_mul_i32 s17, s12, s13
-; GCN-NEXT: s_mul_hi_u32 s19, s12, s18
-; GCN-NEXT: s_mul_hi_u32 s16, s12, s13
+; GCN-NEXT: v_readfirstlane_b32 s13, v1
+; GCN-NEXT: v_readfirstlane_b32 s14, v0
+; GCN-NEXT: s_mul_i32 s15, s3, s13
+; GCN-NEXT: s_mul_hi_u32 s17, s3, s14
+; GCN-NEXT: s_mul_i32 s16, s12, s14
+; GCN-NEXT: s_add_i32 s15, s17, s15
+; GCN-NEXT: s_add_i32 s15, s15, s16
+; GCN-NEXT: s_mul_i32 s18, s3, s14
+; GCN-NEXT: s_mul_i32 s17, s14, s15
+; GCN-NEXT: s_mul_hi_u32 s19, s14, s18
+; GCN-NEXT: s_mul_hi_u32 s16, s14, s15
; GCN-NEXT: s_add_u32 s17, s19, s17
; GCN-NEXT: s_addc_u32 s16, 0, s16
-; GCN-NEXT: s_mul_hi_u32 s20, s15, s18
-; GCN-NEXT: s_mul_i32 s18, s15, s18
+; GCN-NEXT: s_mul_hi_u32 s20, s13, s18
+; GCN-NEXT: s_mul_i32 s18, s13, s18
; GCN-NEXT: s_add_u32 s17, s17, s18
-; GCN-NEXT: s_mul_hi_u32 s19, s15, s13
+; GCN-NEXT: s_mul_hi_u32 s19, s13, s15
; GCN-NEXT: s_addc_u32 s16, s16, s20
; GCN-NEXT: s_addc_u32 s17, s19, 0
-; GCN-NEXT: s_mul_i32 s13, s15, s13
-; GCN-NEXT: s_add_u32 s13, s16, s13
+; GCN-NEXT: s_mul_i32 s15, s13, s15
+; GCN-NEXT: s_add_u32 s15, s16, s15
; GCN-NEXT: s_addc_u32 s16, 0, s17
-; GCN-NEXT: s_add_u32 s17, s12, s13
-; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GCN-NEXT: s_addc_u32 s15, s15, s16
-; GCN-NEXT: s_mul_i32 s12, s3, s15
-; GCN-NEXT: s_mul_hi_u32 s13, s3, s17
-; GCN-NEXT: s_add_i32 s12, s13, s12
-; GCN-NEXT: s_mul_i32 s14, s14, s17
-; GCN-NEXT: s_add_i32 s12, s12, s14
-; GCN-NEXT: s_mul_i32 s3, s3, s17
-; GCN-NEXT: s_mul_hi_u32 s14, s15, s3
-; GCN-NEXT: s_mul_i32 s16, s15, s3
-; GCN-NEXT: s_mul_i32 s19, s17, s12
-; GCN-NEXT: s_mul_hi_u32 s3, s17, s3
-; GCN-NEXT: s_mul_hi_u32 s18, s17, s12
+; GCN-NEXT: s_add_u32 s14, s14, s15
+; GCN-NEXT: s_addc_u32 s13, s13, s16
+; GCN-NEXT: s_mul_i32 s15, s3, s13
+; GCN-NEXT: s_mul_hi_u32 s16, s3, s14
+; GCN-NEXT: s_add_i32 s15, s16, s15
+; GCN-NEXT: s_mul_i32 s12, s12, s14
+; GCN-NEXT: s_add_i32 s15, s15, s12
+; GCN-NEXT: s_mul_i32 s3, s3, s14
+; GCN-NEXT: s_mul_hi_u32 s16, s13, s3
+; GCN-NEXT: s_mul_i32 s17, s13, s3
+; GCN-NEXT: s_mul_i32 s19, s14, s15
+; GCN-NEXT: s_mul_hi_u32 s3, s14, s3
+; GCN-NEXT: s_mul_hi_u32 s18, s14, s15
; GCN-NEXT: s_add_u32 s3, s3, s19
; GCN-NEXT: s_addc_u32 s18, 0, s18
-; GCN-NEXT: s_add_u32 s3, s3, s16
-; GCN-NEXT: s_mul_hi_u32 s13, s15, s12
-; GCN-NEXT: s_addc_u32 s3, s18, s14
-; GCN-NEXT: s_addc_u32 s13, s13, 0
-; GCN-NEXT: s_mul_i32 s12, s15, s12
-; GCN-NEXT: s_add_u32 s3, s3, s12
-; GCN-NEXT: s_addc_u32 s14, 0, s13
-; GCN-NEXT: s_add_u32 s3, s17, s3
-; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0
-; GCN-NEXT: s_addc_u32 s16, s15, s14
+; GCN-NEXT: s_add_u32 s3, s3, s17
+; GCN-NEXT: s_mul_hi_u32 s12, s13, s15
+; GCN-NEXT: s_addc_u32 s3, s18, s16
+; GCN-NEXT: s_addc_u32 s12, s12, 0
+; GCN-NEXT: s_mul_i32 s15, s13, s15
+; GCN-NEXT: s_add_u32 s3, s3, s15
+; GCN-NEXT: s_addc_u32 s12, 0, s12
+; GCN-NEXT: s_add_u32 s3, s14, s3
+; GCN-NEXT: s_addc_u32 s16, s13, s12
; GCN-NEXT: s_ashr_i32 s12, s5, 31
; GCN-NEXT: s_add_u32 s14, s4, s12
; GCN-NEXT: s_mov_b32 s13, s12
@@ -5507,11 +5447,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_mul_i32 s3, s10, s3
; GCN-NEXT: s_sub_u32 s3, s14, s3
; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s14, s18, s11
; GCN-NEXT: s_sub_u32 s20, s3, s10
; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s21, s14, 0
; GCN-NEXT: s_cmp_ge_u32 s21, s11
; GCN-NEXT: s_cselect_b32 s22, -1, 0
@@ -5521,12 +5459,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: s_cselect_b32 s22, s23, s22
; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
; GCN-NEXT: s_subb_u32 s14, s14, s11
-; GCN-NEXT: s_sub_u32 s23, s20, s10
-; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0
-; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0
+; GCN-NEXT: s_sub_u32 s18, s20, s10
; GCN-NEXT: s_subb_u32 s14, s14, 0
; GCN-NEXT: s_cmp_lg_u32 s22, 0
-; GCN-NEXT: s_cselect_b32 s18, s23, s20
+; GCN-NEXT: s_cselect_b32 s18, s18, s20
; GCN-NEXT: s_cselect_b32 s14, s14, s21
; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0
; GCN-NEXT: s_subb_u32 s5, s15, s5
@@ -6299,11 +6235,9 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_readfirstlane_b32 s14, v8
; TONGA-NEXT: s_sub_u32 s12, s12, s14
; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s1, s1, s7
; TONGA-NEXT: s_sub_u32 s18, s12, s6
; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s19, s1, 0
; TONGA-NEXT: s_cmp_ge_u32 s19, s7
; TONGA-NEXT: s_cselect_b32 s20, -1, 0
@@ -6313,12 +6247,10 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: s_cselect_b32 s20, s21, s20
; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
; TONGA-NEXT: s_subb_u32 s1, s1, s7
-; TONGA-NEXT: s_sub_u32 s21, s18, s6
-; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0
+; TONGA-NEXT: s_sub_u32 s16, s18, s6
; TONGA-NEXT: s_subb_u32 s1, s1, 0
; TONGA-NEXT: s_cmp_lg_u32 s20, 0
-; TONGA-NEXT: s_cselect_b32 s16, s21, s18
+; TONGA-NEXT: s_cselect_b32 s16, s16, s18
; TONGA-NEXT: s_cselect_b32 s1, s1, s19
; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0
; TONGA-NEXT: s_subb_u32 s3, s13, s3
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 33b0a5d..ea9bb04 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -51,10 +51,9 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_addc_u32 s13, 0, s14
; GCN-NEXT: s_add_u32 s14, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
+; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s12, s12, s13
; GCN-NEXT: s_mul_i32 s0, s10, s12
; GCN-NEXT: v_readfirstlane_b32 s1, v0
@@ -85,7 +84,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_add_u32 s11, s14, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s1, s12, s10
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mul_hi_u32 v1, s6, v0
@@ -115,46 +113,43 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: v_readfirstlane_b32 s10, v0
; GCN-NEXT: s_add_i32 s5, s10, s5
; GCN-NEXT: s_mul_i32 s10, s9, s4
-; GCN-NEXT: s_add_i32 s10, s5, s10
-; GCN-NEXT: s_sub_i32 s11, s7, s10
+; GCN-NEXT: s_add_i32 s12, s5, s10
+; GCN-NEXT: s_sub_i32 s10, s7, s12
; GCN-NEXT: s_mul_i32 s4, s8, s4
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s12, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s12, 0
-; GCN-NEXT: s_subb_u32 s11, s11, s9
-; GCN-NEXT: s_sub_u32 s13, s6, s8
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT: s_or_b32 s11, s4, s5
+; GCN-NEXT: s_subb_u32 s13, s10, s9
+; GCN-NEXT: s_sub_u32 s14, s6, s8
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_or_b32 s15, s10, s11
+; GCN-NEXT: s_subb_u32 s15, s13, 0
+; GCN-NEXT: s_cmp_ge_u32 s15, s9
+; GCN-NEXT: s_cselect_b32 s16, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s14, s8
+; GCN-NEXT: s_cselect_b32 s17, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s15, s9
+; GCN-NEXT: s_cselect_b32 s16, s17, s16
+; GCN-NEXT: s_or_b32 s10, s10, s11
+; GCN-NEXT: s_subb_u32 s13, s13, s9
+; GCN-NEXT: s_sub_u32 s17, s14, s8
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_or_b32 s10, s10, s11
+; GCN-NEXT: s_subb_u32 s10, s13, 0
+; GCN-NEXT: s_cmp_lg_u32 s16, 0
+; GCN-NEXT: s_cselect_b32 s11, s17, s14
+; GCN-NEXT: s_cselect_b32 s10, s10, s15
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_subb_u32 s14, s11, 0
-; GCN-NEXT: s_cmp_ge_u32 s14, s9
+; GCN-NEXT: s_subb_u32 s4, s7, s12
+; GCN-NEXT: s_cmp_ge_u32 s4, s9
; GCN-NEXT: s_cselect_b32 s5, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s13, s8
-; GCN-NEXT: s_cselect_b32 s15, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s14, s9
-; GCN-NEXT: s_cselect_b32 s15, s15, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_subb_u32 s11, s11, s9
-; GCN-NEXT: s_sub_u32 s16, s13, s8
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_subb_u32 s4, s11, 0
-; GCN-NEXT: s_cmp_lg_u32 s15, 0
-; GCN-NEXT: s_cselect_b32 s5, s16, s13
-; GCN-NEXT: s_cselect_b32 s4, s4, s14
-; GCN-NEXT: s_cmp_lg_u32 s12, 0
-; GCN-NEXT: s_subb_u32 s7, s7, s10
-; GCN-NEXT: s_cmp_ge_u32 s7, s9
-; GCN-NEXT: s_cselect_b32 s10, -1, 0
; GCN-NEXT: s_cmp_ge_u32 s6, s8
-; GCN-NEXT: s_cselect_b32 s8, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s7, s9
-; GCN-NEXT: s_cselect_b32 s8, s8, s10
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
-; GCN-NEXT: s_cselect_b32 s4, s4, s7
-; GCN-NEXT: s_cselect_b32 s5, s5, s6
+; GCN-NEXT: s_cselect_b32 s7, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s4, s9
+; GCN-NEXT: s_cselect_b32 s5, s7, s5
+; GCN-NEXT: s_cmp_lg_u32 s5, 0
+; GCN-NEXT: s_cselect_b32 s4, s10, s4
+; GCN-NEXT: s_cselect_b32 s5, s11, s6
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -187,7 +182,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_or_b32 s8, s8, s9
-; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
@@ -221,7 +215,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_add_u32 s16, s16, 1
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_or_b32 s18, s18, s19
-; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0
; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
@@ -1016,10 +1009,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: s_addc_u32 s13, 0, s14
; GCN-NEXT: s_add_u32 s14, s8, s9
; GCN-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: s_addc_u32 s12, s12, s13
; GCN-NEXT: s_mul_i32 s8, s10, s12
; GCN-NEXT: v_readfirstlane_b32 s9, v0
@@ -1050,7 +1042,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: s_add_u32 s11, s14, s8
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: s_addc_u32 s10, s12, s10
; GCN-NEXT: s_ashr_i32 s8, s7, 31
; GCN-NEXT: s_add_u32 s6, s6, s8
@@ -1083,46 +1074,43 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: v_readfirstlane_b32 s12, v0
; GCN-NEXT: s_add_i32 s11, s12, s11
; GCN-NEXT: s_mul_i32 s12, s5, s10
-; GCN-NEXT: s_add_i32 s12, s11, s12
-; GCN-NEXT: s_sub_i32 s13, s7, s12
+; GCN-NEXT: s_add_i32 s14, s11, s12
+; GCN-NEXT: s_sub_i32 s12, s7, s14
; GCN-NEXT: s_mul_i32 s10, s4, s10
; GCN-NEXT: s_sub_u32 s6, s6, s10
; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s14, s10, s11
-; GCN-NEXT: s_cmp_lg_u32 s14, 0
-; GCN-NEXT: s_subb_u32 s13, s13, s5
-; GCN-NEXT: s_sub_u32 s15, s6, s4
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_cmp_lg_u32 s10, 0
-; GCN-NEXT: s_subb_u32 s16, s13, 0
-; GCN-NEXT: s_cmp_ge_u32 s16, s5
-; GCN-NEXT: s_cselect_b32 s11, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s15, s4
-; GCN-NEXT: s_cselect_b32 s17, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s16, s5
-; GCN-NEXT: s_cselect_b32 s17, s17, s11
-; GCN-NEXT: s_cmp_lg_u32 s10, 0
-; GCN-NEXT: s_subb_u32 s13, s13, s5
-; GCN-NEXT: s_sub_u32 s18, s15, s4
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_or_b32 s13, s10, s11
+; GCN-NEXT: s_subb_u32 s15, s12, s5
+; GCN-NEXT: s_sub_u32 s16, s6, s4
+; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GCN-NEXT: s_or_b32 s17, s12, s13
+; GCN-NEXT: s_subb_u32 s17, s15, 0
+; GCN-NEXT: s_cmp_ge_u32 s17, s5
+; GCN-NEXT: s_cselect_b32 s18, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s16, s4
+; GCN-NEXT: s_cselect_b32 s19, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s17, s5
+; GCN-NEXT: s_cselect_b32 s18, s19, s18
+; GCN-NEXT: s_or_b32 s12, s12, s13
+; GCN-NEXT: s_subb_u32 s15, s15, s5
+; GCN-NEXT: s_sub_u32 s19, s16, s4
+; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GCN-NEXT: s_or_b32 s12, s12, s13
+; GCN-NEXT: s_subb_u32 s12, s15, 0
+; GCN-NEXT: s_cmp_lg_u32 s18, 0
+; GCN-NEXT: s_cselect_b32 s13, s19, s16
+; GCN-NEXT: s_cselect_b32 s12, s12, s17
; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_cmp_lg_u32 s10, 0
-; GCN-NEXT: s_subb_u32 s10, s13, 0
-; GCN-NEXT: s_cmp_lg_u32 s17, 0
-; GCN-NEXT: s_cselect_b32 s11, s18, s15
-; GCN-NEXT: s_cselect_b32 s10, s10, s16
-; GCN-NEXT: s_cmp_lg_u32 s14, 0
-; GCN-NEXT: s_subb_u32 s7, s7, s12
+; GCN-NEXT: s_subb_u32 s7, s7, s14
; GCN-NEXT: s_cmp_ge_u32 s7, s5
-; GCN-NEXT: s_cselect_b32 s12, -1, 0
+; GCN-NEXT: s_cselect_b32 s10, -1, 0
; GCN-NEXT: s_cmp_ge_u32 s6, s4
; GCN-NEXT: s_cselect_b32 s4, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s7, s5
-; GCN-NEXT: s_cselect_b32 s4, s4, s12
+; GCN-NEXT: s_cselect_b32 s4, s4, s10
; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_cselect_b32 s5, s10, s7
-; GCN-NEXT: s_cselect_b32 s4, s11, s6
+; GCN-NEXT: s_cselect_b32 s5, s12, s7
+; GCN-NEXT: s_cselect_b32 s4, s13, s6
; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9]
; GCN-NEXT: s_sub_u32 s4, s4, s8
; GCN-NEXT: s_subb_u32 s5, s5, s8
@@ -1170,7 +1158,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-IR-NEXT: s_add_u32 s16, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_or_b32 s10, s10, s11
-; GCN-IR-NEXT: s_cmp_lg_u32 s10, 0
; GCN-IR-NEXT: s_addc_u32 s10, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_sub_i32 s14, 63, s14
@@ -1204,7 +1191,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-IR-NEXT: s_add_u32 s18, s18, 1
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
; GCN-IR-NEXT: s_or_b32 s20, s20, s21
-; GCN-IR-NEXT: s_cmp_lg_u32 s20, 0
; GCN-IR-NEXT: s_addc_u32 s19, s19, 0
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3]
@@ -1369,10 +1355,9 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_addc_u32 s10, 0, s11
; GCN-NEXT: s_add_u32 s11, s6, s7
; GCN-NEXT: v_mov_b32_e32 v0, s11
-; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
+; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-NEXT: s_or_b32 s6, s6, s7
-; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_addc_u32 s9, s9, s10
; GCN-NEXT: s_mul_i32 s6, s2, s9
; GCN-NEXT: v_readfirstlane_b32 s7, v0
@@ -1403,7 +1388,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_add_u32 s2, s11, s2
; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-NEXT: s_or_b32 s6, s6, s7
-; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_addc_u32 s6, s9, s8
; GCN-NEXT: v_mul_hi_u32 v1, s2, 24
; GCN-NEXT: v_mul_hi_u32 v0, s6, 24
@@ -1418,45 +1402,42 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_mul_i32 s7, s5, s6
; GCN-NEXT: s_mul_i32 s6, s4, s6
; GCN-NEXT: v_readfirstlane_b32 s8, v0
-; GCN-NEXT: s_add_i32 s8, s8, s7
-; GCN-NEXT: s_sub_i32 s9, 0, s8
-; GCN-NEXT: s_sub_u32 s10, 24, s6
-; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GCN-NEXT: s_or_b32 s11, s6, s7
-; GCN-NEXT: s_cmp_lg_u32 s11, 0
-; GCN-NEXT: s_subb_u32 s9, s9, s5
-; GCN-NEXT: s_sub_u32 s12, s10, s4
+; GCN-NEXT: s_add_i32 s10, s8, s7
+; GCN-NEXT: s_sub_i32 s8, 0, s10
+; GCN-NEXT: s_sub_u32 s11, 24, s6
; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
+; GCN-NEXT: s_or_b32 s9, s6, s7
+; GCN-NEXT: s_subb_u32 s12, s8, s5
+; GCN-NEXT: s_sub_u32 s13, s11, s4
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: s_or_b32 s14, s8, s9
+; GCN-NEXT: s_subb_u32 s14, s12, 0
+; GCN-NEXT: s_cmp_ge_u32 s14, s5
+; GCN-NEXT: s_cselect_b32 s15, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s13, s4
+; GCN-NEXT: s_cselect_b32 s16, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s14, s5
+; GCN-NEXT: s_cselect_b32 s15, s16, s15
+; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_subb_u32 s12, s12, s5
+; GCN-NEXT: s_sub_u32 s16, s13, s4
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_subb_u32 s8, s12, 0
+; GCN-NEXT: s_cmp_lg_u32 s15, 0
+; GCN-NEXT: s_cselect_b32 s9, s16, s13
+; GCN-NEXT: s_cselect_b32 s8, s8, s14
; GCN-NEXT: s_or_b32 s6, s6, s7
-; GCN-NEXT: s_cmp_lg_u32 s6, 0
-; GCN-NEXT: s_subb_u32 s13, s9, 0
-; GCN-NEXT: s_cmp_ge_u32 s13, s5
+; GCN-NEXT: s_subb_u32 s6, 0, s10
+; GCN-NEXT: s_cmp_ge_u32 s6, s5
; GCN-NEXT: s_cselect_b32 s7, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s12, s4
-; GCN-NEXT: s_cselect_b32 s14, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s13, s5
-; GCN-NEXT: s_cselect_b32 s14, s14, s7
-; GCN-NEXT: s_cmp_lg_u32 s6, 0
-; GCN-NEXT: s_subb_u32 s9, s9, s5
-; GCN-NEXT: s_sub_u32 s15, s12, s4
-; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GCN-NEXT: s_or_b32 s6, s6, s7
-; GCN-NEXT: s_cmp_lg_u32 s6, 0
-; GCN-NEXT: s_subb_u32 s6, s9, 0
-; GCN-NEXT: s_cmp_lg_u32 s14, 0
-; GCN-NEXT: s_cselect_b32 s7, s15, s12
-; GCN-NEXT: s_cselect_b32 s6, s6, s13
-; GCN-NEXT: s_cmp_lg_u32 s11, 0
-; GCN-NEXT: s_subb_u32 s8, 0, s8
-; GCN-NEXT: s_cmp_ge_u32 s8, s5
-; GCN-NEXT: s_cselect_b32 s9, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s10, s4
+; GCN-NEXT: s_cmp_ge_u32 s11, s4
; GCN-NEXT: s_cselect_b32 s4, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s8, s5
-; GCN-NEXT: s_cselect_b32 s4, s4, s9
+; GCN-NEXT: s_cmp_eq_u32 s6, s5
+; GCN-NEXT: s_cselect_b32 s4, s4, s7
; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_cselect_b32 s4, s6, s8
-; GCN-NEXT: s_cselect_b32 s5, s7, s10
+; GCN-NEXT: s_cselect_b32 s4, s8, s6
+; GCN-NEXT: s_cselect_b32 s5, s9, s11
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -1489,7 +1470,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s8, s2, 1
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_or_b32 s9, s10, s11
-; GCN-IR-NEXT: s_cmp_lg_u32 s9, 0
; GCN-IR-NEXT: s_addc_u32 s3, s3, 0
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_sub_i32 s2, 63, s2
@@ -1522,7 +1502,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
-; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index bb5918b2..bdd22f25 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -18,7 +18,6 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: s_or_b32 s0, s0, s1
-; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_addc_u32 s3, s3, s9
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -35,10 +34,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_add_u32 s2, s2, s4
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
; VI-NEXT: s_addc_u32 s3, s3, s5
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s3
@@ -53,14 +50,12 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s6, s2, s6
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_addc_u32 s4, s3, s7
+; GFX9-NEXT: s_add_u32 s4, s2, s6
+; GFX9-NEXT: s_addc_u32 s5, s3, s7
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
@@ -73,8 +68,6 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s2, s2, s6
-; GFX10-NEXT: s_cselect_b32 s4, -1, 0
-; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: s_addc_u32 s3, s3, s7
; GFX10-NEXT: s_cselect_b32 s4, -1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
@@ -91,14 +84,12 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s2, s2, s4
-; GFX11-NEXT: s_cselect_b32 s4, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-NEXT: s_addc_u32 s3, s3, s5
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
@@ -444,7 +435,6 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_add_u32 s4, s4, s6
; SI-NEXT: s_cselect_b64 s[12:13], -1, 0
; SI-NEXT: s_or_b32 s6, s12, s13
-; SI-NEXT: s_cmp_lg_u32 s6, 0
; SI-NEXT: s_addc_u32 s5, s5, s7
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
@@ -465,16 +455,14 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: s_add_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_add_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_addc_u32 s1, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
-; VI-NEXT: s_addc_u32 s0, s5, s7
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: v_mov_b32_e32 v5, s0
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -486,12 +474,10 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s2, s12, s14
-; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_addc_u32 s0, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_add_u32 s0, s12, s14
+; GFX9-NEXT: s_addc_u32 s1, s13, s15
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -504,10 +490,8 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s0, s12, s14
-; GFX10-NEXT: s_cselect_b32 s1, -1, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: s_cmp_lg_u32 s1, 0
; GFX10-NEXT: s_addc_u32 s1, s13, s15
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: s_cselect_b32 s0, -1, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -520,10 +504,8 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s4, s4, s6
-; GFX11-NEXT: s_cselect_b32 s6, -1, 0
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: s_cmp_lg_u32 s6, 0
; GFX11-NEXT: s_addc_u32 s5, s5, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index 41199b0..fd461ac 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -148,7 +148,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_or_b32 s8, s8, s9
-; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
@@ -182,7 +181,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_add_u32 s10, s10, 1
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
-; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[2:3], s[4:5]
@@ -831,10 +829,9 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_addc_u32 s10, 0, s11
; GCN-NEXT: s_add_u32 s11, s4, s5
; GCN-NEXT: v_mov_b32_e32 v0, s11
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
+; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_addc_u32 s9, s9, s10
; GCN-NEXT: s_mul_i32 s4, s6, s9
; GCN-NEXT: v_readfirstlane_b32 s5, v0
@@ -865,7 +862,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_add_u32 s8, s11, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_addc_u32 s4, s9, s6
; GCN-NEXT: v_mul_hi_u32 v1, s8, 24
; GCN-NEXT: v_mul_hi_u32 v0, s4, 24
@@ -874,52 +870,50 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_readfirstlane_b32 s8, v1
; GCN-NEXT: v_readfirstlane_b32 s5, v0
; GCN-NEXT: s_add_u32 s4, s8, s4
-; GCN-NEXT: s_addc_u32 s8, 0, s5
-; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: s_addc_u32 s10, 0, s5
+; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: s_mul_i32 s0, s3, s8
+; GCN-NEXT: s_mul_i32 s0, s3, s10
; GCN-NEXT: v_readfirstlane_b32 s1, v0
-; GCN-NEXT: s_add_i32 s9, s1, s0
-; GCN-NEXT: s_sub_i32 s10, 0, s9
-; GCN-NEXT: s_mul_i32 s0, s2, s8
-; GCN-NEXT: s_sub_u32 s11, 24, s0
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s12, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s12, 0
-; GCN-NEXT: s_subb_u32 s10, s10, s3
-; GCN-NEXT: s_sub_u32 s13, s11, s2
+; GCN-NEXT: s_add_i32 s11, s1, s0
+; GCN-NEXT: s_sub_i32 s8, 0, s11
+; GCN-NEXT: s_mul_i32 s0, s2, s10
+; GCN-NEXT: s_sub_u32 s12, 24, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT: s_or_b32 s9, s0, s1
+; GCN-NEXT: s_subb_u32 s13, s8, s3
+; GCN-NEXT: s_sub_u32 s14, s12, s2
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_subb_u32 s8, s13, 0
+; GCN-NEXT: s_cmp_ge_u32 s8, s3
+; GCN-NEXT: s_cselect_b32 s9, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s14, s2
+; GCN-NEXT: s_cselect_b32 s13, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s8, s3
+; GCN-NEXT: s_cselect_b32 s8, s13, s9
+; GCN-NEXT: s_add_u32 s9, s10, 1
+; GCN-NEXT: s_addc_u32 s13, 0, 0
+; GCN-NEXT: s_add_u32 s14, s10, 2
+; GCN-NEXT: s_addc_u32 s15, 0, 0
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
+; GCN-NEXT: s_cselect_b32 s8, s14, s9
+; GCN-NEXT: s_cselect_b32 s9, s15, s13
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_subb_u32 s0, s10, 0
+; GCN-NEXT: s_subb_u32 s0, 0, s11
; GCN-NEXT: s_cmp_ge_u32 s0, s3
; GCN-NEXT: s_cselect_b32 s1, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s13, s2
-; GCN-NEXT: s_cselect_b32 s10, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s12, s2
+; GCN-NEXT: s_cselect_b32 s2, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s0, s3
-; GCN-NEXT: s_cselect_b32 s0, s10, s1
-; GCN-NEXT: s_add_u32 s1, s8, 1
-; GCN-NEXT: s_addc_u32 s10, 0, 0
-; GCN-NEXT: s_add_u32 s13, s8, 2
-; GCN-NEXT: s_addc_u32 s14, 0, 0
+; GCN-NEXT: s_cselect_b32 s0, s2, s1
; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_cselect_b32 s0, s13, s1
-; GCN-NEXT: s_cselect_b32 s1, s14, s10
-; GCN-NEXT: s_cmp_lg_u32 s12, 0
-; GCN-NEXT: s_subb_u32 s9, 0, s9
-; GCN-NEXT: s_cmp_ge_u32 s9, s3
-; GCN-NEXT: s_cselect_b32 s10, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s11, s2
-; GCN-NEXT: s_cselect_b32 s2, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s9, s3
-; GCN-NEXT: s_cselect_b32 s2, s2, s10
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
-; GCN-NEXT: s_cselect_b32 s1, s1, 0
-; GCN-NEXT: s_cselect_b32 s0, s0, s8
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: s_cselect_b32 s0, s9, 0
+; GCN-NEXT: s_cselect_b32 s1, s8, s10
+; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
@@ -945,7 +939,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s10, s8, 1
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_or_b32 s6, s6, s7
-; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -978,7 +971,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
-; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
@@ -1317,7 +1309,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s11, s8, 1
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_or_b32 s6, s6, s7
-; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -1347,7 +1338,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s10, s10, 1
; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0
; GCN-IR-NEXT: s_or_b32 s12, s12, s13
-; GCN-IR-NEXT: s_cmp_lg_u32 s12, 0
; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index cdcc914..137dc1f 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -51,10 +51,9 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: s_addc_u32 s13, 0, s14
; GCN-NEXT: s_add_u32 s14, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
+; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s12, s12, s13
; GCN-NEXT: s_mul_i32 s0, s10, s12
; GCN-NEXT: v_readfirstlane_b32 s1, v0
@@ -85,7 +84,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: s_add_u32 s11, s14, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_addc_u32 s1, s12, s10
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mul_hi_u32 v1, s6, v0
@@ -115,46 +113,43 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: v_readfirstlane_b32 s10, v0
; GCN-NEXT: s_add_i32 s5, s10, s5
; GCN-NEXT: s_mul_i32 s10, s9, s4
-; GCN-NEXT: s_add_i32 s10, s5, s10
-; GCN-NEXT: s_sub_i32 s11, s7, s10
+; GCN-NEXT: s_add_i32 s12, s5, s10
+; GCN-NEXT: s_sub_i32 s10, s7, s12
; GCN-NEXT: s_mul_i32 s4, s8, s4
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s12, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s12, 0
-; GCN-NEXT: s_subb_u32 s11, s11, s9
-; GCN-NEXT: s_sub_u32 s13, s6, s8
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT: s_or_b32 s11, s4, s5
+; GCN-NEXT: s_subb_u32 s13, s10, s9
+; GCN-NEXT: s_sub_u32 s14, s6, s8
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_or_b32 s15, s10, s11
+; GCN-NEXT: s_subb_u32 s15, s13, 0
+; GCN-NEXT: s_cmp_ge_u32 s15, s9
+; GCN-NEXT: s_cselect_b32 s16, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s14, s8
+; GCN-NEXT: s_cselect_b32 s17, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s15, s9
+; GCN-NEXT: s_cselect_b32 s16, s17, s16
+; GCN-NEXT: s_or_b32 s10, s10, s11
+; GCN-NEXT: s_subb_u32 s13, s13, s9
+; GCN-NEXT: s_sub_u32 s17, s14, s8
+; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
+; GCN-NEXT: s_or_b32 s10, s10, s11
+; GCN-NEXT: s_subb_u32 s10, s13, 0
+; GCN-NEXT: s_cmp_lg_u32 s16, 0
+; GCN-NEXT: s_cselect_b32 s11, s17, s14
+; GCN-NEXT: s_cselect_b32 s10, s10, s15
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_subb_u32 s14, s11, 0
-; GCN-NEXT: s_cmp_ge_u32 s14, s9
+; GCN-NEXT: s_subb_u32 s4, s7, s12
+; GCN-NEXT: s_cmp_ge_u32 s4, s9
; GCN-NEXT: s_cselect_b32 s5, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s13, s8
-; GCN-NEXT: s_cselect_b32 s15, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s14, s9
-; GCN-NEXT: s_cselect_b32 s15, s15, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_subb_u32 s11, s11, s9
-; GCN-NEXT: s_sub_u32 s16, s13, s8
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_subb_u32 s4, s11, 0
-; GCN-NEXT: s_cmp_lg_u32 s15, 0
-; GCN-NEXT: s_cselect_b32 s5, s16, s13
-; GCN-NEXT: s_cselect_b32 s4, s4, s14
-; GCN-NEXT: s_cmp_lg_u32 s12, 0
-; GCN-NEXT: s_subb_u32 s7, s7, s10
-; GCN-NEXT: s_cmp_ge_u32 s7, s9
-; GCN-NEXT: s_cselect_b32 s10, -1, 0
; GCN-NEXT: s_cmp_ge_u32 s6, s8
-; GCN-NEXT: s_cselect_b32 s8, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s7, s9
-; GCN-NEXT: s_cselect_b32 s8, s8, s10
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
-; GCN-NEXT: s_cselect_b32 s4, s4, s7
-; GCN-NEXT: s_cselect_b32 s5, s5, s6
+; GCN-NEXT: s_cselect_b32 s7, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s4, s9
+; GCN-NEXT: s_cselect_b32 s5, s7, s5
+; GCN-NEXT: s_cmp_lg_u32 s5, 0
+; GCN-NEXT: s_cselect_b32 s4, s10, s4
+; GCN-NEXT: s_cselect_b32 s5, s11, s6
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -187,7 +182,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_or_b32 s8, s8, s9
-; GCN-IR-NEXT: s_cmp_lg_u32 s8, 0
; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
@@ -221,7 +215,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_add_u32 s16, s16, 1
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_or_b32 s18, s18, s19
-; GCN-IR-NEXT: s_cmp_lg_u32 s18, 0
; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
@@ -853,10 +846,9 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_addc_u32 s10, 0, s11
; GCN-NEXT: s_add_u32 s11, s4, s5
; GCN-NEXT: v_mov_b32_e32 v0, s11
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
+; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_addc_u32 s9, s9, s10
; GCN-NEXT: s_mul_i32 s4, s6, s9
; GCN-NEXT: v_readfirstlane_b32 s5, v0
@@ -887,7 +879,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_add_u32 s8, s11, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_addc_u32 s4, s9, s6
; GCN-NEXT: v_mul_hi_u32 v1, s8, 24
; GCN-NEXT: v_mul_hi_u32 v0, s4, 24
@@ -903,46 +894,43 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_mul_i32 s0, s3, s8
; GCN-NEXT: v_readfirstlane_b32 s1, v0
-; GCN-NEXT: s_add_i32 s9, s1, s0
-; GCN-NEXT: s_sub_i32 s10, 0, s9
+; GCN-NEXT: s_add_i32 s10, s1, s0
+; GCN-NEXT: s_sub_i32 s9, 0, s10
; GCN-NEXT: s_mul_i32 s0, s2, s8
-; GCN-NEXT: s_sub_u32 s8, 24, s0
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s11, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s11, 0
-; GCN-NEXT: s_subb_u32 s10, s10, s3
-; GCN-NEXT: s_sub_u32 s12, s8, s2
+; GCN-NEXT: s_sub_u32 s11, 24, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT: s_or_b32 s8, s0, s1
+; GCN-NEXT: s_subb_u32 s12, s9, s3
+; GCN-NEXT: s_sub_u32 s13, s11, s2
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: s_or_b32 s14, s8, s9
+; GCN-NEXT: s_subb_u32 s14, s12, 0
+; GCN-NEXT: s_cmp_ge_u32 s14, s3
+; GCN-NEXT: s_cselect_b32 s15, -1, 0
+; GCN-NEXT: s_cmp_ge_u32 s13, s2
+; GCN-NEXT: s_cselect_b32 s16, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s14, s3
+; GCN-NEXT: s_cselect_b32 s15, s16, s15
+; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_subb_u32 s12, s12, s3
+; GCN-NEXT: s_sub_u32 s16, s13, s2
+; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
+; GCN-NEXT: s_or_b32 s8, s8, s9
+; GCN-NEXT: s_subb_u32 s8, s12, 0
+; GCN-NEXT: s_cmp_lg_u32 s15, 0
+; GCN-NEXT: s_cselect_b32 s9, s16, s13
+; GCN-NEXT: s_cselect_b32 s8, s8, s14
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_subb_u32 s13, s10, 0
-; GCN-NEXT: s_cmp_ge_u32 s13, s3
+; GCN-NEXT: s_subb_u32 s0, 0, s10
+; GCN-NEXT: s_cmp_ge_u32 s0, s3
; GCN-NEXT: s_cselect_b32 s1, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s12, s2
-; GCN-NEXT: s_cselect_b32 s14, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s13, s3
-; GCN-NEXT: s_cselect_b32 s14, s14, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_subb_u32 s10, s10, s3
-; GCN-NEXT: s_sub_u32 s15, s12, s2
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_subb_u32 s0, s10, 0
-; GCN-NEXT: s_cmp_lg_u32 s14, 0
-; GCN-NEXT: s_cselect_b32 s1, s15, s12
-; GCN-NEXT: s_cselect_b32 s0, s0, s13
-; GCN-NEXT: s_cmp_lg_u32 s11, 0
-; GCN-NEXT: s_subb_u32 s9, 0, s9
-; GCN-NEXT: s_cmp_ge_u32 s9, s3
-; GCN-NEXT: s_cselect_b32 s10, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s8, s2
+; GCN-NEXT: s_cmp_ge_u32 s11, s2
; GCN-NEXT: s_cselect_b32 s2, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s9, s3
-; GCN-NEXT: s_cselect_b32 s2, s2, s10
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
-; GCN-NEXT: s_cselect_b32 s0, s0, s9
-; GCN-NEXT: s_cselect_b32 s1, s1, s8
+; GCN-NEXT: s_cmp_eq_u32 s0, s3
+; GCN-NEXT: s_cselect_b32 s1, s2, s1
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
+; GCN-NEXT: s_cselect_b32 s0, s8, s0
+; GCN-NEXT: s_cselect_b32 s1, s9, s11
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -970,7 +958,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s10, s8, 1
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_or_b32 s6, s6, s7
-; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -1003,7 +990,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_or_b32 s16, s16, s17
-; GCN-IR-NEXT: s_cmp_lg_u32 s16, 0
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
@@ -1093,7 +1079,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s11, s8, 1
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_or_b32 s6, s6, s7
-; GCN-IR-NEXT: s_cmp_lg_u32 s6, 0
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -1123,7 +1108,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_add_u32 s12, s12, 1
; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0
; GCN-IR-NEXT: s_or_b32 s14, s14, s15
-; GCN-IR-NEXT: s_cmp_lg_u32 s14, 0
; GCN-IR-NEXT: s_addc_u32 s13, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index d67a7b1..e8db647 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -18,7 +18,6 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: s_or_b32 s0, s0, s1
-; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_subb_u32 s3, s3, s9
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -35,10 +34,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_sub_u32 s2, s2, s4
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
; VI-NEXT: s_subb_u32 s3, s3, s5
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s3
@@ -53,14 +50,12 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s6, s2, s6
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_subb_u32 s4, s3, s7
+; GFX9-NEXT: s_sub_u32 s4, s2, s6
+; GFX9-NEXT: s_subb_u32 s5, s3, s7
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
@@ -73,8 +68,6 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sub_u32 s2, s2, s6
-; GFX10-NEXT: s_cselect_b32 s4, -1, 0
-; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: s_subb_u32 s3, s3, s7
; GFX10-NEXT: s_cselect_b32 s4, -1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
@@ -91,14 +84,12 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_u32 s2, s2, s4
-; GFX11-NEXT: s_cselect_b32 s4, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-NEXT: s_subb_u32 s3, s3, s5
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
@@ -443,7 +434,6 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_sub_u32 s4, s4, s6
; SI-NEXT: s_cselect_b64 s[12:13], -1, 0
; SI-NEXT: s_or_b32 s6, s12, s13
-; SI-NEXT: s_cmp_lg_u32 s6, 0
; SI-NEXT: s_subb_u32 s5, s5, s7
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
@@ -464,16 +454,14 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: s_sub_u32 s2, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_sub_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_subb_u32 s1, s5, s7
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
-; VI-NEXT: s_subb_u32 s0, s5, s7
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: v_mov_b32_e32 v5, s0
-; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
@@ -485,12 +473,10 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s2, s12, s14
-; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_subb_u32 s0, s13, s15
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_sub_u32 s0, s12, s14
+; GFX9-NEXT: s_subb_u32 s1, s13, s15
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
@@ -503,10 +489,8 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sub_u32 s0, s12, s14
-; GFX10-NEXT: s_cselect_b32 s1, -1, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: s_cmp_lg_u32 s1, 0
; GFX10-NEXT: s_subb_u32 s1, s13, s15
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: s_cselect_b32 s0, -1, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
@@ -519,10 +503,8 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_u32 s4, s4, s6
-; GFX11-NEXT: s_cselect_b32 s6, -1, 0
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: s_cmp_lg_u32 s6, 0
; GFX11-NEXT: s_subb_u32 s5, s5, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: s_cselect_b32 s4, -1, 0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 75db387..28c6b40 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -774,44 +774,40 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: s_add_u32 s11, s12, s11
; GFX1032-NEXT: s_addc_u32 s12, 0, s13
; GFX1032-NEXT: s_add_u32 s8, s8, s11
-; GFX1032-NEXT: s_cselect_b32 s11, -1, 0
-; GFX1032-NEXT: s_mul_hi_u32 s13, s9, s8
-; GFX1032-NEXT: s_cmp_lg_u32 s11, 0
-; GFX1032-NEXT: s_mul_i32 s11, s9, s8
; GFX1032-NEXT: s_addc_u32 s5, s5, s12
-; GFX1032-NEXT: s_mul_i32 s10, s10, s8
+; GFX1032-NEXT: s_mul_hi_u32 s11, s9, s8
+; GFX1032-NEXT: s_mul_i32 s12, s9, s8
; GFX1032-NEXT: s_mul_i32 s9, s9, s5
-; GFX1032-NEXT: s_mul_hi_u32 s12, s8, s11
-; GFX1032-NEXT: s_add_i32 s9, s13, s9
-; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s11
+; GFX1032-NEXT: s_mul_i32 s10, s10, s8
+; GFX1032-NEXT: s_add_i32 s9, s11, s9
+; GFX1032-NEXT: s_mul_i32 s11, s5, s12
; GFX1032-NEXT: s_add_i32 s9, s9, s10
-; GFX1032-NEXT: s_mul_i32 s10, s5, s11
+; GFX1032-NEXT: s_mul_hi_u32 s10, s8, s12
; GFX1032-NEXT: s_mul_i32 s15, s8, s9
; GFX1032-NEXT: s_mul_hi_u32 s14, s8, s9
-; GFX1032-NEXT: s_add_u32 s12, s12, s15
+; GFX1032-NEXT: s_add_u32 s10, s10, s15
+; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s12
; GFX1032-NEXT: s_addc_u32 s14, 0, s14
-; GFX1032-NEXT: s_mul_hi_u32 s11, s5, s9
-; GFX1032-NEXT: s_add_u32 s10, s12, s10
+; GFX1032-NEXT: s_mul_hi_u32 s12, s5, s9
+; GFX1032-NEXT: s_add_u32 s10, s10, s11
; GFX1032-NEXT: s_mul_i32 s9, s5, s9
; GFX1032-NEXT: s_addc_u32 s10, s14, s13
-; GFX1032-NEXT: s_addc_u32 s11, s11, 0
+; GFX1032-NEXT: s_addc_u32 s11, s12, 0
; GFX1032-NEXT: s_add_u32 s9, s10, s9
; GFX1032-NEXT: s_addc_u32 s10, 0, s11
; GFX1032-NEXT: s_add_u32 s8, s8, s9
-; GFX1032-NEXT: s_cselect_b32 s9, -1, 0
-; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s8
-; GFX1032-NEXT: s_cmp_lg_u32 s9, 0
-; GFX1032-NEXT: s_mul_hi_u32 s9, s3, s8
; GFX1032-NEXT: s_addc_u32 s5, s5, s10
-; GFX1032-NEXT: s_mul_i32 s8, s3, s8
+; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s8
; GFX1032-NEXT: s_mul_i32 s12, s2, s5
-; GFX1032-NEXT: s_mul_hi_u32 s10, s2, s5
-; GFX1032-NEXT: s_add_u32 s11, s11, s12
-; GFX1032-NEXT: s_addc_u32 s10, 0, s10
+; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s5
+; GFX1032-NEXT: s_mul_hi_u32 s10, s3, s8
+; GFX1032-NEXT: s_mul_i32 s8, s3, s8
+; GFX1032-NEXT: s_add_u32 s9, s9, s12
+; GFX1032-NEXT: s_addc_u32 s11, 0, s11
; GFX1032-NEXT: s_mul_hi_u32 s13, s3, s5
-; GFX1032-NEXT: s_add_u32 s8, s11, s8
+; GFX1032-NEXT: s_add_u32 s8, s9, s8
; GFX1032-NEXT: s_mul_i32 s5, s3, s5
-; GFX1032-NEXT: s_addc_u32 s8, s10, s9
+; GFX1032-NEXT: s_addc_u32 s8, s11, s10
; GFX1032-NEXT: s_addc_u32 s9, s13, 0
; GFX1032-NEXT: s_add_u32 s5, s8, s5
; GFX1032-NEXT: s_addc_u32 s8, 0, s9
@@ -824,11 +820,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: s_sub_i32 s11, s3, s9
; GFX1032-NEXT: s_sub_u32 s10, s2, s10
; GFX1032-NEXT: s_cselect_b32 s12, -1, 0
-; GFX1032-NEXT: s_cmp_lg_u32 s12, 0
; GFX1032-NEXT: s_subb_u32 s11, s11, s1
; GFX1032-NEXT: s_sub_u32 s13, s10, s0
-; GFX1032-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1032-NEXT: s_cmp_lg_u32 s14, 0
; GFX1032-NEXT: s_subb_u32 s11, s11, 0
; GFX1032-NEXT: s_cmp_ge_u32 s11, s1
; GFX1032-NEXT: s_cselect_b32 s14, -1, 0
@@ -901,8 +894,8 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0
; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s1
-; GFX1064-NEXT: s_sub_u32 s9, 0, s0
-; GFX1064-NEXT: s_subb_u32 s10, 0, s1
+; GFX1064-NEXT: s_sub_u32 s8, 0, s0
+; GFX1064-NEXT: s_subb_u32 s9, 0, s1
; GFX1064-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX1064-NEXT: v_rcp_f32_e32 v0, v0
; GFX1064-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -911,109 +904,102 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 {
; GFX1064-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GFX1064-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX1064-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s8, v1
-; GFX1064-NEXT: v_readfirstlane_b32 s4, v0
-; GFX1064-NEXT: s_mul_i32 s5, s9, s8
-; GFX1064-NEXT: s_mul_hi_u32 s12, s9, s4
-; GFX1064-NEXT: s_mul_i32 s11, s10, s4
-; GFX1064-NEXT: s_add_i32 s5, s12, s5
-; GFX1064-NEXT: s_mul_i32 s13, s9, s4
-; GFX1064-NEXT: s_add_i32 s5, s5, s11
-; GFX1064-NEXT: s_mul_hi_u32 s12, s4, s13
-; GFX1064-NEXT: s_mul_i32 s15, s4, s5
-; GFX1064-NEXT: s_mul_hi_u32 s14, s8, s13
-; GFX1064-NEXT: s_mul_i32 s11, s8, s13
-; GFX1064-NEXT: s_mul_hi_u32 s13, s4, s5
+; GFX1064-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1064-NEXT: v_readfirstlane_b32 s5, v0
+; GFX1064-NEXT: s_mul_i32 s10, s8, s4
+; GFX1064-NEXT: s_mul_hi_u32 s12, s8, s5
+; GFX1064-NEXT: s_mul_i32 s11, s9, s5
+; GFX1064-NEXT: s_add_i32 s10, s12, s10
+; GFX1064-NEXT: s_mul_i32 s13, s8, s5
+; GFX1064-NEXT: s_add_i32 s10, s10, s11
+; GFX1064-NEXT: s_mul_hi_u32 s12, s5, s13
+; GFX1064-NEXT: s_mul_i32 s15, s5, s10
+; GFX1064-NEXT: s_mul_hi_u32 s14, s4, s13
+; GFX1064-NEXT: s_mul_i32 s11, s4, s13
+; GFX1064-NEXT: s_mul_hi_u32 s13, s5, s10
; GFX1064-NEXT: s_add_u32 s12, s12, s15
; GFX1064-NEXT: s_addc_u32 s13, 0, s13
-; GFX1064-NEXT: s_mul_hi_u32 s16, s8, s5
+; GFX1064-NEXT: s_mul_hi_u32 s16, s4, s10
; GFX1064-NEXT: s_add_u32 s11, s12, s11
-; GFX1064-NEXT: s_mul_i32 s5, s8, s5
+; GFX1064-NEXT: s_mul_i32 s10, s4, s10
; GFX1064-NEXT: s_addc_u32 s11, s13, s14
; GFX1064-NEXT: s_addc_u32 s12, s16, 0
-; GFX1064-NEXT: s_add_u32 s5, s11, s5
+; GFX1064-NEXT: s_add_u32 s10, s11, s10
; GFX1064-NEXT: s_addc_u32 s11, 0, s12
-; GFX1064-NEXT: s_add_u32 s12, s4, s5
-; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX1064-NEXT: s_mul_hi_u32 s13, s9, s12
-; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX1064-NEXT: s_mul_i32 s4, s9, s12
-; GFX1064-NEXT: s_addc_u32 s8, s8, s11
-; GFX1064-NEXT: s_mul_i32 s10, s10, s12
-; GFX1064-NEXT: s_mul_i32 s9, s9, s8
-; GFX1064-NEXT: s_mul_hi_u32 s5, s12, s4
-; GFX1064-NEXT: s_add_i32 s9, s13, s9
-; GFX1064-NEXT: s_mul_hi_u32 s11, s8, s4
-; GFX1064-NEXT: s_add_i32 s9, s9, s10
-; GFX1064-NEXT: s_mul_i32 s4, s8, s4
-; GFX1064-NEXT: s_mul_i32 s14, s12, s9
-; GFX1064-NEXT: s_mul_hi_u32 s13, s12, s9
-; GFX1064-NEXT: s_add_u32 s5, s5, s14
+; GFX1064-NEXT: s_add_u32 s5, s5, s10
+; GFX1064-NEXT: s_addc_u32 s4, s4, s11
+; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s5
+; GFX1064-NEXT: s_mul_i32 s11, s8, s5
+; GFX1064-NEXT: s_mul_i32 s8, s8, s4
+; GFX1064-NEXT: s_mul_i32 s9, s9, s5
+; GFX1064-NEXT: s_add_i32 s8, s10, s8
+; GFX1064-NEXT: s_mul_i32 s10, s4, s11
+; GFX1064-NEXT: s_add_i32 s8, s8, s9
+; GFX1064-NEXT: s_mul_hi_u32 s9, s5, s11
+; GFX1064-NEXT: s_mul_i32 s14, s5, s8
+; GFX1064-NEXT: s_mul_hi_u32 s13, s5, s8
+; GFX1064-NEXT: s_add_u32 s9, s9, s14
+; GFX1064-NEXT: s_mul_hi_u32 s12, s4, s11
; GFX1064-NEXT: s_addc_u32 s13, 0, s13
-; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s9
-; GFX1064-NEXT: s_add_u32 s4, s5, s4
-; GFX1064-NEXT: s_mul_i32 s9, s8, s9
-; GFX1064-NEXT: s_addc_u32 s4, s13, s11
-; GFX1064-NEXT: s_addc_u32 s5, s10, 0
-; GFX1064-NEXT: s_add_u32 s4, s4, s9
-; GFX1064-NEXT: s_addc_u32 s9, 0, s5
-; GFX1064-NEXT: s_add_u32 s10, s12, s4
-; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX1064-NEXT: s_mul_hi_u32 s11, s2, s10
-; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX1064-NEXT: s_mul_hi_u32 s4, s3, s10
-; GFX1064-NEXT: s_addc_u32 s5, s8, s9
-; GFX1064-NEXT: s_mul_i32 s8, s3, s10
-; GFX1064-NEXT: s_mul_i32 s10, s2, s5
-; GFX1064-NEXT: s_mul_hi_u32 s9, s2, s5
-; GFX1064-NEXT: s_add_u32 s10, s11, s10
-; GFX1064-NEXT: s_addc_u32 s9, 0, s9
-; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s5
-; GFX1064-NEXT: s_add_u32 s8, s10, s8
+; GFX1064-NEXT: s_mul_hi_u32 s11, s4, s8
+; GFX1064-NEXT: s_add_u32 s9, s9, s10
+; GFX1064-NEXT: s_mul_i32 s8, s4, s8
+; GFX1064-NEXT: s_addc_u32 s9, s13, s12
+; GFX1064-NEXT: s_addc_u32 s10, s11, 0
+; GFX1064-NEXT: s_add_u32 s8, s9, s8
+; GFX1064-NEXT: s_addc_u32 s9, 0, s10
+; GFX1064-NEXT: s_add_u32 s5, s5, s8
+; GFX1064-NEXT: s_addc_u32 s4, s4, s9
+; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s5
+; GFX1064-NEXT: s_mul_i32 s11, s2, s4
+; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s4
+; GFX1064-NEXT: s_mul_hi_u32 s9, s3, s5
; GFX1064-NEXT: s_mul_i32 s5, s3, s5
-; GFX1064-NEXT: s_addc_u32 s4, s9, s4
+; GFX1064-NEXT: s_add_u32 s8, s8, s11
+; GFX1064-NEXT: s_addc_u32 s10, 0, s10
+; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s4
+; GFX1064-NEXT: s_add_u32 s5, s8, s5
+; GFX1064-NEXT: s_mul_i32 s4, s3, s4
+; GFX1064-NEXT: s_addc_u32 s5, s10, s9
; GFX1064-NEXT: s_addc_u32 s8, s12, 0
-; GFX1064-NEXT: s_add_u32 s10, s4, s5
+; GFX1064-NEXT: s_add_u32 s10, s5, s4
; GFX1064-NEXT: s_addc_u32 s11, 0, s8
; GFX1064-NEXT: s_mul_hi_u32 s4, s0, s10
; GFX1064-NEXT: s_mul_i32 s5, s0, s11
; GFX1064-NEXT: s_mul_i32 s8, s1, s10
; GFX1064-NEXT: s_add_i32 s4, s4, s5
-; GFX1064-NEXT: s_add_i32 s12, s4, s8
+; GFX1064-NEXT: s_add_i32 s8, s4, s8
; GFX1064-NEXT: s_mul_i32 s4, s0, s10
-; GFX1064-NEXT: s_sub_i32 s8, s3, s12
-; GFX1064-NEXT: s_sub_u32 s13, s2, s4
+; GFX1064-NEXT: s_sub_i32 s9, s3, s8
+; GFX1064-NEXT: s_sub_u32 s12, s2, s4
; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX1064-NEXT: s_subb_u32 s14, s8, s1
-; GFX1064-NEXT: s_sub_u32 s15, s13, s0
-; GFX1064-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GFX1064-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GFX1064-NEXT: s_subb_u32 s8, s14, 0
-; GFX1064-NEXT: s_cmp_ge_u32 s8, s1
-; GFX1064-NEXT: s_cselect_b32 s9, -1, 0
-; GFX1064-NEXT: s_cmp_ge_u32 s15, s0
+; GFX1064-NEXT: s_subb_u32 s9, s9, s1
+; GFX1064-NEXT: s_sub_u32 s13, s12, s0
+; GFX1064-NEXT: s_subb_u32 s9, s9, 0
+; GFX1064-NEXT: s_cmp_ge_u32 s9, s1
; GFX1064-NEXT: s_cselect_b32 s14, -1, 0
-; GFX1064-NEXT: s_cmp_eq_u32 s8, s1
-; GFX1064-NEXT: s_cselect_b32 s8, s14, s9
-; GFX1064-NEXT: s_add_u32 s9, s10, 1
+; GFX1064-NEXT: s_cmp_ge_u32 s13, s0
+; GFX1064-NEXT: s_cselect_b32 s13, -1, 0
+; GFX1064-NEXT: s_cmp_eq_u32 s9, s1
+; GFX1064-NEXT: s_cselect_b32 s9, s13, s14
+; GFX1064-NEXT: s_add_u32 s13, s10, 1
; GFX1064-NEXT: s_addc_u32 s14, s11, 0
; GFX1064-NEXT: s_add_u32 s15, s10, 2
; GFX1064-NEXT: s_addc_u32 s16, s11, 0
-; GFX1064-NEXT: s_cmp_lg_u32 s8, 0
-; GFX1064-NEXT: s_cselect_b32 s15, s15, s9
+; GFX1064-NEXT: s_cmp_lg_u32 s9, 0
+; GFX1064-NEXT: s_cselect_b32 s13, s15, s13
; GFX1064-NEXT: s_cselect_b32 s14, s16, s14
; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX1064-NEXT: s_subb_u32 s3, s3, s12
+; GFX1064-NEXT: s_subb_u32 s3, s3, s8
; GFX1064-NEXT: s_cmp_ge_u32 s3, s1
; GFX1064-NEXT: s_cselect_b32 s4, -1, 0
-; GFX1064-NEXT: s_cmp_ge_u32 s13, s0
+; GFX1064-NEXT: s_cmp_ge_u32 s12, s0
; GFX1064-NEXT: s_cselect_b32 s5, -1, 0
; GFX1064-NEXT: s_cmp_eq_u32 s3, s1
; GFX1064-NEXT: s_cselect_b32 s1, s5, s4
; GFX1064-NEXT: s_cmp_lg_u32 s1, 0
; GFX1064-NEXT: s_cselect_b32 s5, s14, s11
-; GFX1064-NEXT: s_cselect_b32 s4, s15, s10
+; GFX1064-NEXT: s_cselect_b32 s4, s13, s10
; GFX1064-NEXT: s_cbranch_execnz .LBB15_3
; GFX1064-NEXT: .LBB15_2:
; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
index 64d055b..4445383 100644
--- a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
+++ b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
@@ -271,7 +271,6 @@ define i1 @workgroup_nonzero() {
; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX8-NEXT: s_or_b32 s4, s12, s13
; DAGISEL-GFX8-NEXT: s_or_b32 s4, s4, s14
-; DAGISEL-GFX8-NEXT: s_cmp_lg_u32 s4, 0
; DAGISEL-GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0
; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -281,7 +280,6 @@ define i1 @workgroup_nonzero() {
; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX942-NEXT: s_or_b32 s0, s12, s13
; DAGISEL-GFX942-NEXT: s_or_b32 s0, s0, s14
-; DAGISEL-GFX942-NEXT: s_cmp_lg_u32 s0, 0
; DAGISEL-GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0
; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; DAGISEL-GFX942-NEXT: s_setpc_b64 s[30:31]
@@ -299,8 +297,6 @@ define i1 @workgroup_nonzero() {
; DAGISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
; DAGISEL-GFX12-NEXT: s_or_b32 s0, s0, s1
-; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
-; DAGISEL-GFX12-NEXT: s_cmp_lg_u32 s0, 0
; DAGISEL-GFX12-NEXT: s_cselect_b32 s0, -1, 0
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
; DAGISEL-GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
@@ -311,7 +307,6 @@ define i1 @workgroup_nonzero() {
; GISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX8-NEXT: s_or_b32 s4, s12, s13
; GISEL-GFX8-NEXT: s_or_b32 s4, s4, s14
-; GISEL-GFX8-NEXT: s_cmp_lg_u32 s4, 0
; GISEL-GFX8-NEXT: s_cselect_b32 s4, 1, 0
; GISEL-GFX8-NEXT: v_mov_b32_e32 v0, s4
; GISEL-GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -321,7 +316,6 @@ define i1 @workgroup_nonzero() {
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX942-NEXT: s_or_b32 s0, s12, s13
; GISEL-GFX942-NEXT: s_or_b32 s0, s0, s14
-; GISEL-GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GISEL-GFX942-NEXT: s_cselect_b32 s0, 1, 0
; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s0
; GISEL-GFX942-NEXT: s_setpc_b64 s[30:31]
@@ -339,8 +333,6 @@ define i1 @workgroup_nonzero() {
; GISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0
; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
; GISEL-GFX12-NEXT: s_or_b32 s0, s0, s1
-; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
-; GISEL-GFX12-NEXT: s_cmp_lg_u32 s0, 0
; GISEL-GFX12-NEXT: s_cselect_b32 s0, 1, 0
; GISEL-GFX12-NEXT: s_wait_alu 0xfffe
; GISEL-GFX12-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/BPF/BTF/ptr-named-2.ll b/llvm/test/CodeGen/BPF/BTF/ptr-named-2.ll
new file mode 100644
index 0000000..df0cbeb
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/BTF/ptr-named-2.ll
@@ -0,0 +1,59 @@
+; RUN: llc -mtriple=bpfel -filetype=obj -o %t1 %s
+; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s
+; RUN: llc -mtriple=bpfeb -filetype=obj -o %t1 %s
+; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s
+;
+; This IR is hand-written.
+
+; ModuleID = 'ptr-named-2.ll'
+source_filename = "ptr-named-2.ll"
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "bpfel-unknown-none"
+
+%struct.TypeExamples = type { i32*, i32, i32, i32* }
+
+@type_examples = internal global %struct.TypeExamples zeroinitializer, align 8, !dbg !0
+
+!llvm.dbg.cu = !{!1}
+!llvm.module.flags = !{!2, !3, !4}
+!llvm.ident = !{!21}
+
+; CHECK-BTF: [1] STRUCT 'TypeExamples' size=32 vlen=4
+; CHECK-BTF-NEXT: 'ptr' type_id=2 bits_offset=0
+; CHECK-BTF-NEXT: 'volatile' type_id=4 bits_offset=64
+; CHECK-BTF-NEXT: 'const' type_id=5 bits_offset=128
+; CHECK-BTF-NEXT: 'restrict_ptr' type_id=6 bits_offset=192
+; CHECK-BTF-NEXT: [2] PTR '(anon)' type_id=3
+; CHECK-BTF-NEXT: [3] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED
+; CHECK-BTF-NEXT: [4] VOLATILE '(anon)' type_id=3
+; CHECK-BTF-NEXT: [5] CONST '(anon)' type_id=3
+; CHECK-BTF-NEXT: [6] RESTRICT '(anon)' type_id=7
+; CHECK-BTF-NEXT: [7] PTR '(anon)' type_id=3
+; CHECK-BTF-NEXT: [8] VAR 'type_examples' type_id=1, linkage=static
+; CHECK-BTF-NEXT: [9] DATASEC '.bss' size=0 vlen=1
+; CHECK-BTF-NEXT: type_id=8 offset=0 size=24
+
+!0 = !DIGlobalVariableExpression(var: !5, expr: !DIExpression())
+!1 = distinct !DICompileUnit(language: DW_LANG_C99, file: !6, isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !7, globals: !8, splitDebugInlining: false, nameTableKind: None)
+!2 = !{i32 2, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = distinct !DIGlobalVariable(name: "type_examples", scope: !1, file: !6, line: 12, type: !9, isLocal: true, isDefinition: true)
+!6 = !DIFile(filename: "ptr-named-2.ll", directory: "/tmp")
+!7 = !{}
+!8 = !{!0}
+!9 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "TypeExamples", file: !6, line: 5, size: 256, elements: !10)
+!10 = !{!11, !12, !13, !14}
+!11 = !DIDerivedType(tag: DW_TAG_member, name: "ptr", scope: !9, file: !6, line: 6, baseType: !15, size: 64)
+!12 = !DIDerivedType(tag: DW_TAG_member, name: "volatile", scope: !9, file: !6, line: 7, baseType: !17, size: 64, offset: 64)
+!13 = !DIDerivedType(tag: DW_TAG_member, name: "const", scope: !9, file: !6, line: 8, baseType: !18, size: 64, offset: 128)
+!14 = !DIDerivedType(tag: DW_TAG_member, name: "restrict_ptr", scope: !9, file: !6, line: 9, baseType: !19, size: 64, offset: 192)
+!15 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "*int", baseType: !16, size: 64)
+!16 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!17 = !DIDerivedType(tag: DW_TAG_volatile_type, name: "volatile int", baseType: !16)
+!18 = !DIDerivedType(tag: DW_TAG_const_type, name: "const int", baseType: !16)
+!19 = !DIDerivedType(tag: DW_TAG_restrict_type, name: "*int restrict", baseType: !20)
+!20 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 64)
+!21 = !{!"my hand-written IR"}
diff --git a/llvm/test/CodeGen/BPF/BTF/ptr-named.ll b/llvm/test/CodeGen/BPF/BTF/ptr-named.ll
new file mode 100644
index 0000000..675c34e
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/BTF/ptr-named.ll
@@ -0,0 +1,75 @@
+; RUN: llc -mtriple=bpfel -filetype=obj -o %t1 %s
+; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s
+; RUN: llc -mtriple=bpfeb -filetype=obj -o %t1 %s
+; RUN: llvm-objcopy --dump-section='.BTF'=%t2 %t1
+; RUN: %python %p/print_btf.py %t2 | FileCheck -check-prefixes=CHECK-BTF %s
+;
+; Source:
+; #![no_std]
+; #![no_main]
+;
+; pub struct MyType {
+; ptr: *const u32,
+; }
+;
+; impl MyType {
+; pub const fn new() -> Self {
+; let ptr = core::ptr::null();
+; Self { ptr }
+; }
+; }
+;
+; unsafe impl Sync for MyType {}
+;
+; #[unsafe(no_mangle)]
+; pub static X: MyType = MyType::new();
+;
+; #[cfg(not(test))]
+; #[panic_handler]
+; fn panic(_info: &core::panic::PanicInfo) -> ! {
+; loop {}
+; }
+; Compilation flag:
+; cargo +nightly rustc -Zbuild-std=core --target=bpfel-unknown-none -- --emit=llvm-bc
+; llvm-extract --glob=X $(find target/ -name "*.bc" | head -n 1) -o ptr-named.bc
+; llvm-dis ptr-named.bc -o ptr-named.ll
+
+; ModuleID = 'ptr-named.bc'
+source_filename = "1m2uqe50qkwxmo53ydydvou91"
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "bpfel"
+
+@X = constant [8 x i8] zeroinitializer, align 8, !dbg !0
+
+!llvm.module.flags = !{!11, !12, !13, !14}
+!llvm.ident = !{!15}
+!llvm.dbg.cu = !{!16}
+
+; CHECK-BTF: [1] STRUCT 'MyType' size=8 vlen=1
+; CHECK-BTF-NEXT: 'ptr' type_id=2 bits_offset=0
+; CHECK-BTF-NEXT: [2] PTR '(anon)' type_id=3
+; CHECK-BTF-NEXT: [3] INT 'u32' size=4 bits_offset=0 nr_bits=32 encoding=(none)
+; CHECK-BTF-NEXT: [4] VAR 'X' type_id=1, linkage=global
+; CHECK-BTF-NEXT: [5] DATASEC '.rodata' size=0 vlen=1
+; CHECK-BTF-NEXT: type_id=4 offset=0 size=8
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "X", scope: !2, file: !3, line: 19, type: !4, isLocal: false, isDefinition: true, align: 64)
+!2 = !DINamespace(name: "ptr_named", scope: null)
+!3 = !DIFile(filename: "ptr-named/src/main.rs", directory: "/tmp/ptr-named", checksumkind: CSK_MD5, checksum: "e37168304600b30cbb5ba168f0384932")
+!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "MyType", scope: !2, file: !5, size: 64, align: 64, flags: DIFlagPublic, elements: !6, templateParams: !10, identifier: "7609fa40332dd486922f074276a171c3")
+!5 = !DIFile(filename: "<unknown>", directory: "")
+!6 = !{!7}
+!7 = !DIDerivedType(tag: DW_TAG_member, name: "ptr", scope: !4, file: !5, baseType: !8, size: 64, align: 64, flags: DIFlagPrivate)
+!8 = !DIDerivedType(tag: DW_TAG_pointer_type, name: "*const u32", baseType: !9, size: 64, align: 64, dwarfAddressSpace: 0)
+!9 = !DIBasicType(name: "u32", size: 32, encoding: DW_ATE_unsigned)
+!10 = !{}
+!11 = !{i32 8, !"PIC Level", i32 2}
+!12 = !{i32 7, !"PIE Level", i32 2}
+!13 = !{i32 7, !"Dwarf Version", i32 4}
+!14 = !{i32 2, !"Debug Info Version", i32 3}
+!15 = !{!"rustc version 1.92.0-nightly (c8905eaa6 2025-09-28)"}
+!16 = distinct !DICompileUnit(language: DW_LANG_Rust, file: !17, producer: "clang LLVM (rustc version 1.92.0-nightly (c8905eaa6 2025-09-28))", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !18, splitDebugInlining: false, nameTableKind: None)
+!17 = !DIFile(filename: "ptr-named/src/main.rs/@/1m2uqe50qkwxmo53ydydvou91", directory: "/tmp/ptr-named")
+!18 = !{!0}
diff --git a/llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll b/llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll
index 4f13f47..56798c8 100644
--- a/llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll
+++ b/llvm/test/CodeGen/DirectX/Metadata/resource-symbols.ll
@@ -28,6 +28,11 @@ define void @test() {
@llvm.dx.resource.handlefrombinding(i32 0, i32 10, i32 1, i32 0, ptr @SB.str)
; CHECK: %"StructuredBuffer<struct.S>" = type { %struct.S }
+ ; StructuredBuffer<float[3][2]>
+ %struct1 = call target("dx.RawBuffer", [3 x [2 x float]], 0, 0)
+ @llvm.dx.resource.handlefrombinding(i32 0, i32 12, i32 1, i32 0, ptr null)
+ ; CHECK: %"StructuredBuffer<float[3][2]>" = type { [3 x [2 x float]] }
+
; ByteAddressBuffer
%byteaddr = call target("dx.RawBuffer", i8, 0, 0)
@llvm.dx.resource.handlefrombinding(i32 0, i32 20, i32 1, i32 0, ptr null)
@@ -40,12 +45,14 @@ define void @test() {
; CHECK-NEXT: @[[T1:.*]] = external constant %"Buffer<int32_t>"
; CHECK-NEXT: @[[T2:.*]] = external constant %"Buffer<uint32_t3>"
; CHECK-NEXT: @[[S0:.*]] = external constant %"StructuredBuffer<struct.S>"
+; CHECK-NEXT: @[[S1:.*]] = external constant %"StructuredBuffer<float[3][2]>"
; CHECK-NEXT: @[[B0:.*]] = external constant %ByteAddressBuffer
; CHECK: !{i32 0, ptr @[[T0]], !"A"
; CHECK: !{i32 1, ptr @[[T1]], !""
; CHECK: !{i32 2, ptr @[[T2]], !""
; CHECK: !{i32 3, ptr @[[S0]], !"SB"
-; CHECK: !{i32 4, ptr @[[B0]], !""
+; CHECK: !{i32 4, ptr @[[S1]], !""
+; CHECK: !{i32 5, ptr @[[B0]], !""
attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) }
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json
index 2894fff..da0d13d 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_2D_vocab.json
@@ -1,5 +1,5 @@
{
- "entities" : {
+ "Opcodes" : {
"ABS_Fp":[1, 2],
"ADC":[3, 4],
"ADD":[5, 6],
@@ -7,5 +7,21 @@
"ADDPDrr":[9, 10],
"ADDPSrr":[11, 12],
"ADDSDrm":[13, 14]
+ },
+ "CommonOperands": {
+ "Immediate": [0.1, 0.1],
+ "MBB": [0.2, 0.2],
+ "FrameIndex": [0.3, 0.3],
+ "GlobalAddress": [0.4, 0.4]
+ },
+ "PhysicalRegisters": {
+ "GR32": [0.5, 0.5],
+ "GR64": [0.6, 0.6],
+ "XMM": [0.7, 0.7]
+ },
+ "VirtualRegisters": {
+ "GR32": [0.8, 0.8],
+ "GR64": [0.9, 0.9],
+ "XMM": [1.0, 1.0]
}
} \ No newline at end of file
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json
index 5de715b..f4b14a4 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_dummy_3D_vocab.json
@@ -1,5 +1,5 @@
{
- "entities": {
+ "Opcodes": {
"KILL": [0.1, 0.2, 0.3],
"MOV": [0.4, 0.5, 0.6],
"LEA": [0.7, 0.8, 0.9],
@@ -18,5 +18,21 @@
"POP": [4.6, 4.7, 4.8],
"NOP": [4.9, 5.0, 5.1],
"COPY": [5.2, 5.3, 5.4]
+ },
+ "CommonOperands": {
+ "Immediate": [0.1, 0.1, 0.1],
+ "MBB": [0.2, 0.2, 0.2],
+ "FrameIndex": [0.3, 0.3, 0.3],
+ "GlobalAddress": [0.4, 0.4, 0.4]
+ },
+ "PhysicalRegisters": {
+ "GR32": [0.5, 0.5, 0.5],
+ "GR64": [0.6, 0.6, 0.6],
+ "XMM": [0.7, 0.7, 0.7]
+ },
+ "VirtualRegisters": {
+ "GR32": [0.8, 0.8, 0.8],
+ "GR64": [0.9, 0.9, 0.9],
+ "XMM": [1.0, 1.0, 1.0]
}
} \ No newline at end of file
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json
index bf04163..6274fb7 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_inconsistent_dims.json
@@ -1,7 +1,16 @@
{
- "entities": {
+ "Opcodes": {
"ADD": [1.0, 2.0, 3.0],
"SUB": [1.5],
"MUL": [2.0, 3.0]
+ },
+ "CommonOperands": {
+ "Immediate": [1.0]
+ },
+ "PhysicalRegisters": {
+ "GR32": [1.0, 2.0]
+ },
+ "VirtualRegisters": {
+ "GR32": [1.0, 2.0, 3.0]
}
}
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json
index 63e8ccbd..7bfdf3b 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/mir2vec_zero_vocab.json
@@ -1,5 +1,5 @@
{
- "entities": {
+ "Opcodes": {
"ADD": [],
"SUB": [],
"MUL": [],
@@ -8,5 +8,14 @@
"JMP": [],
"CALL": [],
"RET": []
+ },
+ "CommonOperands": {
+ "Immediate": []
+ },
+ "PhysicalRegisters": {
+ "GR32": []
+ },
+ "VirtualRegisters": {
+ "GR32": []
}
} \ No newline at end of file
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt
index 6327cff..d3c0da9 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_print.txt
@@ -6880,3 +6880,294 @@ Key: XSHA: [ 0.00 0.00 ]
Key: XSTORE: [ 0.00 0.00 ]
Key: XSUSLDTRK: [ 0.00 0.00 ]
Key: XTEST: [ 0.00 0.00 ]
+Key: Immediate: [ 0.10 0.10 ]
+Key: CImmediate: [ 0.00 0.00 ]
+Key: FPImmediate: [ 0.00 0.00 ]
+Key: MBB: [ 0.20 0.20 ]
+Key: FrameIndex: [ 0.30 0.30 ]
+Key: ConstantPoolIndex: [ 0.00 0.00 ]
+Key: TargetIndex: [ 0.00 0.00 ]
+Key: JumpTableIndex: [ 0.00 0.00 ]
+Key: ExternalSymbol: [ 0.00 0.00 ]
+Key: GlobalAddress: [ 0.40 0.40 ]
+Key: BlockAddress: [ 0.00 0.00 ]
+Key: RegisterMask: [ 0.00 0.00 ]
+Key: RegisterLiveOut: [ 0.00 0.00 ]
+Key: Metadata: [ 0.00 0.00 ]
+Key: MCSymbol: [ 0.00 0.00 ]
+Key: CFIIndex: [ 0.00 0.00 ]
+Key: IntrinsicID: [ 0.00 0.00 ]
+Key: Predicate: [ 0.00 0.00 ]
+Key: ShuffleMask: [ 0.00 0.00 ]
+Key: PhyReg_GR8: [ 0.00 0.00 ]
+Key: PhyReg_GRH8: [ 0.00 0.00 ]
+Key: PhyReg_GR8_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_GR8_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_GR8_ABCD_H: [ 0.00 0.00 ]
+Key: PhyReg_GR8_ABCD_L: [ 0.00 0.00 ]
+Key: PhyReg_GRH16: [ 0.00 0.00 ]
+Key: PhyReg_GR16: [ 0.00 0.00 ]
+Key: PhyReg_GR16_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_GR16_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_VK1: [ 0.00 0.00 ]
+Key: PhyReg_VK16: [ 0.00 0.00 ]
+Key: PhyReg_VK2: [ 0.00 0.00 ]
+Key: PhyReg_VK4: [ 0.00 0.00 ]
+Key: PhyReg_VK8: [ 0.00 0.00 ]
+Key: PhyReg_VK16WM: [ 0.00 0.00 ]
+Key: PhyReg_VK1WM: [ 0.00 0.00 ]
+Key: PhyReg_VK2WM: [ 0.00 0.00 ]
+Key: PhyReg_VK4WM: [ 0.00 0.00 ]
+Key: PhyReg_VK8WM: [ 0.00 0.00 ]
+Key: PhyReg_SEGMENT_REG: [ 0.00 0.00 ]
+Key: PhyReg_GR16_ABCD: [ 0.00 0.00 ]
+Key: PhyReg_FPCCR: [ 0.00 0.00 ]
+Key: PhyReg_FR16X: [ 0.00 0.00 ]
+Key: PhyReg_FR16: [ 0.00 0.00 ]
+Key: PhyReg_VK16PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK1PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK2PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK4PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK8PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK1PAIR_with_sub_mask_0_in_VK1WM: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_FR32X: [ 0.00 0.00 ]
+Key: PhyReg_GR32: [ 0.50 0.50 ]
+Key: PhyReg_GR32_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_DEBUG_REG: [ 0.00 0.00 ]
+Key: PhyReg_FR32: [ 0.00 0.00 ]
+Key: PhyReg_GR32_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_GR32_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_GR32_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_VK32: [ 0.00 0.00 ]
+Key: PhyReg_GR32_NOREX_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_RFP32: [ 0.00 0.00 ]
+Key: PhyReg_VK32WM: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ABCD: [ 0.00 0.00 ]
+Key: PhyReg_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR32_AD: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BPSP: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BSI: [ 0.00 0.00 ]
+Key: PhyReg_GR32_CB: [ 0.00 0.00 ]
+Key: PhyReg_GR32_DC: [ 0.00 0.00 ]
+Key: PhyReg_GR32_DIBP: [ 0.00 0.00 ]
+Key: PhyReg_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit: [ 0.00 0.00 ]
+Key: PhyReg_CCR: [ 0.00 0.00 ]
+Key: PhyReg_DFCCR: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: PhyReg_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_with_sub_32bit: [ 0.00 0.00 ]
+Key: PhyReg_RFP64: [ 0.00 0.00 ]
+Key: PhyReg_GR64: [ 0.60 0.60 ]
+Key: PhyReg_FR64X: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_CONTROL_REG: [ 0.00 0.00 ]
+Key: PhyReg_FR64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_GR64PLTSafe: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX2_NOSP_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TCW64_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_VK64: [ 0.00 0.00 ]
+Key: PhyReg_VR64: [ 0.00 0.00 ]
+Key: PhyReg_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TCW64_and_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_VK64WM: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_GR64PLTSafe_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_ABCD: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_AD: [ 0.00 0.00 ]
+Key: PhyReg_GR64_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_CB: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_GR64_A: [ 0.00 0.00 ]
+Key: PhyReg_GR64_ArgRef_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_RST: [ 0.00 0.00 ]
+Key: PhyReg_RFP80: [ 0.00 0.00 ]
+Key: PhyReg_RFP80_7: [ 0.00 0.00 ]
+Key: PhyReg_VR128X: [ 0.00 0.00 ]
+Key: PhyReg_VR128: [ 0.00 0.00 ]
+Key: PhyReg_VR256X: [ 0.00 0.00 ]
+Key: PhyReg_VR256: [ 0.00 0.00 ]
+Key: PhyReg_VR512: [ 0.00 0.00 ]
+Key: PhyReg_VR512_0_15: [ 0.00 0.00 ]
+Key: PhyReg_TILE: [ 0.00 0.00 ]
+Key: PhyReg_TILEPAIR: [ 0.00 0.00 ]
+Key: VirtReg_GR8: [ 0.00 0.00 ]
+Key: VirtReg_GRH8: [ 0.00 0.00 ]
+Key: VirtReg_GR8_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR8_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR8_ABCD_H: [ 0.00 0.00 ]
+Key: VirtReg_GR8_ABCD_L: [ 0.00 0.00 ]
+Key: VirtReg_GRH16: [ 0.00 0.00 ]
+Key: VirtReg_GR16: [ 0.00 0.00 ]
+Key: VirtReg_GR16_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_VK1: [ 0.00 0.00 ]
+Key: VirtReg_VK16: [ 0.00 0.00 ]
+Key: VirtReg_VK2: [ 0.00 0.00 ]
+Key: VirtReg_VK4: [ 0.00 0.00 ]
+Key: VirtReg_VK8: [ 0.00 0.00 ]
+Key: VirtReg_VK16WM: [ 0.00 0.00 ]
+Key: VirtReg_VK1WM: [ 0.00 0.00 ]
+Key: VirtReg_VK2WM: [ 0.00 0.00 ]
+Key: VirtReg_VK4WM: [ 0.00 0.00 ]
+Key: VirtReg_VK8WM: [ 0.00 0.00 ]
+Key: VirtReg_SEGMENT_REG: [ 0.00 0.00 ]
+Key: VirtReg_GR16_ABCD: [ 0.00 0.00 ]
+Key: VirtReg_FPCCR: [ 0.00 0.00 ]
+Key: VirtReg_FR16X: [ 0.00 0.00 ]
+Key: VirtReg_FR16: [ 0.00 0.00 ]
+Key: VirtReg_VK16PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK1PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK2PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK4PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK8PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK1PAIR_with_sub_mask_0_in_VK1WM: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_FR32X: [ 0.00 0.00 ]
+Key: VirtReg_GR32: [ 0.80 0.80 ]
+Key: VirtReg_GR32_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_DEBUG_REG: [ 0.00 0.00 ]
+Key: VirtReg_FR32: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_VK32: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_RFP32: [ 0.00 0.00 ]
+Key: VirtReg_VK32WM: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ABCD: [ 0.00 0.00 ]
+Key: VirtReg_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_AD: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BPSP: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR32_DC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit: [ 0.00 0.00 ]
+Key: VirtReg_CCR: [ 0.00 0.00 ]
+Key: VirtReg_DFCCR: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_with_sub_32bit: [ 0.00 0.00 ]
+Key: VirtReg_RFP64: [ 0.00 0.00 ]
+Key: VirtReg_GR64: [ 0.90 0.90 ]
+Key: VirtReg_FR64X: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_CONTROL_REG: [ 0.00 0.00 ]
+Key: VirtReg_FR64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64PLTSafe: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TCW64_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_VK64: [ 0.00 0.00 ]
+Key: VirtReg_VR64: [ 0.00 0.00 ]
+Key: VirtReg_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TCW64_and_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_VK64WM: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR64PLTSafe_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_ABCD: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_AD: [ 0.00 0.00 ]
+Key: VirtReg_GR64_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_A: [ 0.00 0.00 ]
+Key: VirtReg_GR64_ArgRef_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_RST: [ 0.00 0.00 ]
+Key: VirtReg_RFP80: [ 0.00 0.00 ]
+Key: VirtReg_RFP80_7: [ 0.00 0.00 ]
+Key: VirtReg_VR128X: [ 0.00 0.00 ]
+Key: VirtReg_VR128: [ 0.00 0.00 ]
+Key: VirtReg_VR256X: [ 0.00 0.00 ]
+Key: VirtReg_VR256: [ 0.00 0.00 ]
+Key: VirtReg_VR512: [ 0.00 0.00 ]
+Key: VirtReg_VR512_0_15: [ 0.00 0.00 ]
+Key: VirtReg_TILE: [ 0.00 0.00 ]
+Key: VirtReg_TILEPAIR: [ 0.00 0.00 ]
diff --git a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt
index 4409e6d..c6e5508 100644
--- a/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt
+++ b/llvm/test/CodeGen/MIR2Vec/Inputs/reference_x86_vocab_wo=0.5_print.txt
@@ -6880,3 +6880,294 @@ Key: XSHA: [ 0.00 0.00 ]
Key: XSTORE: [ 0.00 0.00 ]
Key: XSUSLDTRK: [ 0.00 0.00 ]
Key: XTEST: [ 0.00 0.00 ]
+Key: Immediate: [ 0.10 0.10 ]
+Key: CImmediate: [ 0.00 0.00 ]
+Key: FPImmediate: [ 0.00 0.00 ]
+Key: MBB: [ 0.20 0.20 ]
+Key: FrameIndex: [ 0.30 0.30 ]
+Key: ConstantPoolIndex: [ 0.00 0.00 ]
+Key: TargetIndex: [ 0.00 0.00 ]
+Key: JumpTableIndex: [ 0.00 0.00 ]
+Key: ExternalSymbol: [ 0.00 0.00 ]
+Key: GlobalAddress: [ 0.40 0.40 ]
+Key: BlockAddress: [ 0.00 0.00 ]
+Key: RegisterMask: [ 0.00 0.00 ]
+Key: RegisterLiveOut: [ 0.00 0.00 ]
+Key: Metadata: [ 0.00 0.00 ]
+Key: MCSymbol: [ 0.00 0.00 ]
+Key: CFIIndex: [ 0.00 0.00 ]
+Key: IntrinsicID: [ 0.00 0.00 ]
+Key: Predicate: [ 0.00 0.00 ]
+Key: ShuffleMask: [ 0.00 0.00 ]
+Key: PhyReg_GR8: [ 0.00 0.00 ]
+Key: PhyReg_GRH8: [ 0.00 0.00 ]
+Key: PhyReg_GR8_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_GR8_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_GR8_ABCD_H: [ 0.00 0.00 ]
+Key: PhyReg_GR8_ABCD_L: [ 0.00 0.00 ]
+Key: PhyReg_GRH16: [ 0.00 0.00 ]
+Key: PhyReg_GR16: [ 0.00 0.00 ]
+Key: PhyReg_GR16_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_GR16_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_VK1: [ 0.00 0.00 ]
+Key: PhyReg_VK16: [ 0.00 0.00 ]
+Key: PhyReg_VK2: [ 0.00 0.00 ]
+Key: PhyReg_VK4: [ 0.00 0.00 ]
+Key: PhyReg_VK8: [ 0.00 0.00 ]
+Key: PhyReg_VK16WM: [ 0.00 0.00 ]
+Key: PhyReg_VK1WM: [ 0.00 0.00 ]
+Key: PhyReg_VK2WM: [ 0.00 0.00 ]
+Key: PhyReg_VK4WM: [ 0.00 0.00 ]
+Key: PhyReg_VK8WM: [ 0.00 0.00 ]
+Key: PhyReg_SEGMENT_REG: [ 0.00 0.00 ]
+Key: PhyReg_GR16_ABCD: [ 0.00 0.00 ]
+Key: PhyReg_FPCCR: [ 0.00 0.00 ]
+Key: PhyReg_FR16X: [ 0.00 0.00 ]
+Key: PhyReg_FR16: [ 0.00 0.00 ]
+Key: PhyReg_VK16PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK1PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK2PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK4PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK8PAIR: [ 0.00 0.00 ]
+Key: PhyReg_VK1PAIR_with_sub_mask_0_in_VK1WM: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_FR32X: [ 0.00 0.00 ]
+Key: PhyReg_GR32: [ 0.50 0.50 ]
+Key: PhyReg_GR32_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_DEBUG_REG: [ 0.00 0.00 ]
+Key: PhyReg_FR32: [ 0.00 0.00 ]
+Key: PhyReg_GR32_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_GR32_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_GR32_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_VK32: [ 0.00 0.00 ]
+Key: PhyReg_GR32_NOREX_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_RFP32: [ 0.00 0.00 ]
+Key: PhyReg_VK32WM: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ABCD: [ 0.00 0.00 ]
+Key: PhyReg_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR32_AD: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BPSP: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BSI: [ 0.00 0.00 ]
+Key: PhyReg_GR32_CB: [ 0.00 0.00 ]
+Key: PhyReg_GR32_DC: [ 0.00 0.00 ]
+Key: PhyReg_GR32_DIBP: [ 0.00 0.00 ]
+Key: PhyReg_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit: [ 0.00 0.00 ]
+Key: PhyReg_CCR: [ 0.00 0.00 ]
+Key: PhyReg_DFCCR: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: PhyReg_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit: [ 0.00 0.00 ]
+Key: PhyReg_LOW32_ADDR_ACCESS_with_sub_32bit: [ 0.00 0.00 ]
+Key: PhyReg_RFP64: [ 0.00 0.00 ]
+Key: PhyReg_GR64: [ 0.60 0.60 ]
+Key: PhyReg_FR64X: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_CONTROL_REG: [ 0.00 0.00 ]
+Key: PhyReg_FR64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_GR64PLTSafe: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX2_NOSP_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TCW64_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_VK64: [ 0.00 0.00 ]
+Key: PhyReg_VR64: [ 0.00 0.00 ]
+Key: PhyReg_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX_NOSP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TCW64_and_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: PhyReg_VK64WM: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: PhyReg_GR64PLTSafe_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_NOREX_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: PhyReg_GR64_ABCD: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_AD: [ 0.00 0.00 ]
+Key: PhyReg_GR64_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_CB: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_GR64_A: [ 0.00 0.00 ]
+Key: PhyReg_GR64_ArgRef_and_GR64_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_and_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: PhyReg_RST: [ 0.00 0.00 ]
+Key: PhyReg_RFP80: [ 0.00 0.00 ]
+Key: PhyReg_RFP80_7: [ 0.00 0.00 ]
+Key: PhyReg_VR128X: [ 0.00 0.00 ]
+Key: PhyReg_VR128: [ 0.00 0.00 ]
+Key: PhyReg_VR256X: [ 0.00 0.00 ]
+Key: PhyReg_VR256: [ 0.00 0.00 ]
+Key: PhyReg_VR512: [ 0.00 0.00 ]
+Key: PhyReg_VR512_0_15: [ 0.00 0.00 ]
+Key: PhyReg_TILE: [ 0.00 0.00 ]
+Key: PhyReg_TILEPAIR: [ 0.00 0.00 ]
+Key: VirtReg_GR8: [ 0.00 0.00 ]
+Key: VirtReg_GRH8: [ 0.00 0.00 ]
+Key: VirtReg_GR8_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR8_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR8_ABCD_H: [ 0.00 0.00 ]
+Key: VirtReg_GR8_ABCD_L: [ 0.00 0.00 ]
+Key: VirtReg_GRH16: [ 0.00 0.00 ]
+Key: VirtReg_GR16: [ 0.00 0.00 ]
+Key: VirtReg_GR16_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_VK1: [ 0.00 0.00 ]
+Key: VirtReg_VK16: [ 0.00 0.00 ]
+Key: VirtReg_VK2: [ 0.00 0.00 ]
+Key: VirtReg_VK4: [ 0.00 0.00 ]
+Key: VirtReg_VK8: [ 0.00 0.00 ]
+Key: VirtReg_VK16WM: [ 0.00 0.00 ]
+Key: VirtReg_VK1WM: [ 0.00 0.00 ]
+Key: VirtReg_VK2WM: [ 0.00 0.00 ]
+Key: VirtReg_VK4WM: [ 0.00 0.00 ]
+Key: VirtReg_VK8WM: [ 0.00 0.00 ]
+Key: VirtReg_SEGMENT_REG: [ 0.00 0.00 ]
+Key: VirtReg_GR16_ABCD: [ 0.00 0.00 ]
+Key: VirtReg_FPCCR: [ 0.00 0.00 ]
+Key: VirtReg_FR16X: [ 0.00 0.00 ]
+Key: VirtReg_FR16: [ 0.00 0.00 ]
+Key: VirtReg_VK16PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK1PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK2PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK4PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK8PAIR: [ 0.00 0.00 ]
+Key: VirtReg_VK1PAIR_with_sub_mask_0_in_VK1WM: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_FR32X: [ 0.00 0.00 ]
+Key: VirtReg_GR32: [ 0.80 0.80 ]
+Key: VirtReg_GR32_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_DEBUG_REG: [ 0.00 0.00 ]
+Key: VirtReg_FR32: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_VK32: [ 0.00 0.00 ]
+Key: VirtReg_GR32_NOREX_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_RFP32: [ 0.00 0.00 ]
+Key: VirtReg_VK32WM: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ABCD: [ 0.00 0.00 ]
+Key: VirtReg_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_AD: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BPSP: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR32_DC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_32bit: [ 0.00 0.00 ]
+Key: VirtReg_CCR: [ 0.00 0.00 ]
+Key: VirtReg_DFCCR: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_RBP_with_sub_8bit_with_sub_32bit: [ 0.00 0.00 ]
+Key: VirtReg_LOW32_ADDR_ACCESS_with_sub_32bit: [ 0.00 0.00 ]
+Key: VirtReg_RFP64: [ 0.00 0.00 ]
+Key: VirtReg_GR64: [ 0.90 0.90 ]
+Key: VirtReg_FR64X: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_CONTROL_REG: [ 0.00 0.00 ]
+Key: VirtReg_FR64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_16bit_in_GR16_NOREX2: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64PLTSafe: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TCW64_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_VK64: [ 0.00 0.00 ]
+Key: VirtReg_VR64: [ 0.00 0.00 ]
+Key: VirtReg_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_NOSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TCW64_and_GR64_TC_with_sub_8bit: [ 0.00 0.00 ]
+Key: VirtReg_VK64WM: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_NOREX2_NOSP_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_TC_and_GR64_with_sub_16bit_in_GR16_NOREX: [ 0.00 0.00 ]
+Key: VirtReg_GR64PLTSafe_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64PLTSafe_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_NOREX_and_GR64_TCW64: [ 0.00 0.00 ]
+Key: VirtReg_GR64_ABCD: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_AD: [ 0.00 0.00 ]
+Key: VirtReg_GR64_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS_RBP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_A: [ 0.00 0.00 ]
+Key: VirtReg_GR64_ArgRef_and_GR64_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_and_LOW32_ADDR_ACCESS: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ABCD_and_GR32_BSI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_AD_and_GR32_ArgRef: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_ArgRef_and_GR32_CB: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_DIBP: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BPSP_and_GR32_TC: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_BSI_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_GR64_with_sub_32bit_in_GR32_DIBP_and_GR32_SIDI: [ 0.00 0.00 ]
+Key: VirtReg_RST: [ 0.00 0.00 ]
+Key: VirtReg_RFP80: [ 0.00 0.00 ]
+Key: VirtReg_RFP80_7: [ 0.00 0.00 ]
+Key: VirtReg_VR128X: [ 0.00 0.00 ]
+Key: VirtReg_VR128: [ 0.00 0.00 ]
+Key: VirtReg_VR256X: [ 0.00 0.00 ]
+Key: VirtReg_VR256: [ 0.00 0.00 ]
+Key: VirtReg_VR512: [ 0.00 0.00 ]
+Key: VirtReg_VR512_0_15: [ 0.00 0.00 ]
+Key: VirtReg_TILE: [ 0.00 0.00 ]
+Key: VirtReg_TILEPAIR: [ 0.00 0.00 ]
diff --git a/llvm/test/CodeGen/MIR2Vec/if-else.mir b/llvm/test/CodeGen/MIR2Vec/if-else.mir
index 5734a23..f2572f5 100644
--- a/llvm/test/CodeGen/MIR2Vec/if-else.mir
+++ b/llvm/test/CodeGen/MIR2Vec/if-else.mir
@@ -135,10 +135,10 @@ body: |
# CHECK: Machine basic block vectors:
# CHECK-NEXT: Machine basic block: abc:entry:
-# CHECK-NEXT: [ 16.50 17.10 17.70 ]
+# CHECK-NEXT: [ 23.60 24.20 24.80 ]
# CHECK-NEXT: Machine basic block: abc:if.then:
-# CHECK-NEXT: [ 4.50 4.80 5.10 ]
+# CHECK-NEXT: [ 7.30 7.60 7.90 ]
# CHECK-NEXT: Machine basic block: abc:if.else:
-# CHECK-NEXT: [ 0.80 1.00 1.20 ]
+# CHECK-NEXT: [ 3.40 3.60 3.80 ]
# CHECK-NEXT: Machine basic block: abc:return:
-# CHECK-NEXT: [ 6.60 6.90 7.20 ] \ No newline at end of file
+# CHECK-NEXT: [ 8.80 9.10 9.40 ]
diff --git a/llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir b/llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir
index 338cb63..0fdcc81 100644
--- a/llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir
+++ b/llvm/test/CodeGen/MIR2Vec/mir2vec-basic-symbolic.mir
@@ -48,29 +48,29 @@ body: |
RET 0
# CHECK: MIR2Vec embeddings for machine function add_function:
-# CHECK: Function vector: [ 19.20 19.80 20.40 ]
+# CHECK: Function vector: [ 26.50 27.10 27.70 ]
# CHECK-NEXT: Machine basic block vectors:
# CHECK-NEXT: Machine basic block: add_function:entry:
-# CHECK-NEXT: [ 19.20 19.80 20.40 ]
+# CHECK-NEXT: [ 26.50 27.10 27.70 ]
# CHECK-NEXT: Machine instruction vectors:
# CHECK-NEXT: Machine instruction: %1:gr32 = COPY $esi
-# CHECK-NEXT: [ 5.20 5.30 5.40 ]
+# CHECK-NEXT: [ 6.00 6.10 6.20 ]
# CHECK-NEXT: Machine instruction: %0:gr32 = COPY $edi
-# CHECK-NEXT: [ 5.20 5.30 5.40 ]
+# CHECK-NEXT: [ 6.00 6.10 6.20 ]
# CHECK-NEXT: Machine instruction: %2:gr32 = nsw ADD32rr %0:gr32(tied-def 0), %1:gr32, implicit-def dead $eflags
-# CHECK-NEXT: [ 1.30 1.40 1.50 ]
+# CHECK-NEXT: [ 3.70 3.80 3.90 ]
# CHECK-NEXT: Machine instruction: %3:gr32 = ADD32rr %2:gr32(tied-def 0), %2:gr32, implicit-def dead $eflags
-# CHECK-NEXT: [ 1.30 1.40 1.50 ]
+# CHECK-NEXT: [ 3.70 3.80 3.90 ]
# CHECK-NEXT: Machine instruction: $eax = COPY %3:gr32
-# CHECK-NEXT: [ 5.20 5.30 5.40 ]
+# CHECK-NEXT: [ 6.00 6.10 6.20 ]
# CHECK-NEXT: Machine instruction: RET 0, $eax
-# CHECK-NEXT: [ 1.00 1.10 1.20 ]
+# CHECK-NEXT: [ 1.10 1.20 1.30 ]
# CHECK: MIR2Vec embeddings for machine function simple_function:
-# CHECK-NEXT:Function vector: [ 1.00 1.10 1.20 ]
+# CHECK-NEXT:Function vector: [ 1.10 1.20 1.30 ]
# CHECK-NEXT: Machine basic block vectors:
# CHECK-NEXT: Machine basic block: simple_function:entry:
-# CHECK-NEXT: [ 1.00 1.10 1.20 ]
+# CHECK-NEXT: [ 1.10 1.20 1.30 ]
# CHECK-NEXT: Machine instruction vectors:
# CHECK-NEXT: Machine instruction: RET 0
-# CHECK-NEXT: [ 1.00 1.10 1.20 ] \ No newline at end of file
+# CHECK-NEXT: [ 1.10 1.20 1.30 ]
diff --git a/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll b/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll
index c6554bc..13e908e 100644
--- a/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll
+++ b/llvm/test/CodeGen/MIR2Vec/vocab-error-handling.ll
@@ -10,6 +10,6 @@ define dso_local void @test() {
}
; CHECK-INVALID: MIR2Vec Vocabulary Printer: Failed to get vocabulary - MIR2Vec vocabulary file path not specified; set it using --mir2vec-vocab-path
-; CHECK-ZERO-DIM: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Dimension of 'entities' section of the vocabulary is zero
-; CHECK-NO-ENTITIES: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Missing 'entities' section in vocabulary file
-; CHECK-INCONSISTENT-DIMS: MIR2Vec Vocabulary Printer: Failed to get vocabulary - All vectors in the 'entities' section of the vocabulary are not of the same dimension
+; CHECK-ZERO-DIM: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Dimension of 'Opcodes' section of the vocabulary is zero
+; CHECK-NO-ENTITIES: MIR2Vec Vocabulary Printer: Failed to get vocabulary - Missing 'Opcodes' section in vocabulary file
+; CHECK-INCONSISTENT-DIMS: MIR2Vec Vocabulary Printer: Failed to get vocabulary - All vectors in the 'Opcodes' section of the vocabulary are not of the same dimension
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll
index 4d32e66..6d41875 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/cbuffer.ll
@@ -1,5 +1,5 @@
; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.6-vulkan1.3-library %s -o - | FileCheck %s
-; Test that uses of cbuffer members inside ConstantExprs are handled correctly.
+; Test that uses of cbuffer members are handled correctly.
; CHECK-DAG: OpDecorate %[[MyCBuffer:[0-9]+]] DescriptorSet 0
; CHECK-DAG: OpDecorate %[[MyCBuffer]] Binding 0
@@ -37,10 +37,8 @@ entry:
; CHECK: %[[tmp_ptr:[0-9]+]] = OpAccessChain {{%[0-9]+}} %[[tmp]] %[[uint_0]] %[[uint_0]]
; CHECK: %[[v_ptr:.+]] = OpAccessChain %[[_ptr_Uniform_v4float]] %[[tmp]] %[[uint_0]] %[[uint_1]]
; CHECK: %[[s_ptr_gep:[0-9]+]] = OpInBoundsAccessChain %[[_ptr_Uniform_float]] %[[tmp_ptr]] %[[uint_0]] %[[uint_1]]
- %gep = getelementptr inbounds %MyStruct, ptr addrspace(12) @s, i32 0, i32 0, i32 1
-
; CHECK: %[[s_val:.+]] = OpLoad %[[float]] %[[s_ptr_gep]]
- %load_from_gep = load float, ptr addrspace(12) %gep, align 4
+ %load_from_gep = load float, ptr addrspace(12) getelementptr inbounds (%MyStruct, ptr addrspace(12) @s, i32 0, i32 0, i32 1), align 4
; CHECK: %[[v_val:.+]] = OpLoad %[[v4float]] %[[v_ptr]]
%load_v = load <4 x float>, ptr addrspace(12) @v, align 16
diff --git a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
index 94efe0f..104ec31 100644
--- a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
+++ b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
@@ -5,6 +5,7 @@ target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20
%struct.TwoInts = type { i32, i32 }
%struct.ThreeInts = type { i32, i32, i32 }
%struct.FourInts = type { i32, i32, i32, i32 }
+%struct.TwoShorts = type { i16, i16 }
%struct.ThreeShorts = type { i16, i16, i16 }
%struct.FourShorts = type { i16, i16, i16, i16 }
%struct.FiveShorts = type { i16, i16, i16, i16, i16 }
@@ -12,6 +13,8 @@ target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20
%struct.ThreeBytes = type { i8, i8, i8 }
%struct.FourBytes = type { i8, i8, i8, i8 }
%struct.EightBytes = type { i8, i8, i8, i8, i8, i8, i8, i8 }
+%struct.TwoFloats = type { float, float }
+%struct.FourFloats = type { float, float, float, float }
; CHECK-LABEL: two_ints_same_op:
; CHECK: loop
@@ -1536,3 +1539,1608 @@ define hidden void @scale_uv_row_down2_linear(ptr nocapture noundef readonly %0,
34: ; preds = %6, %4
ret void
}
+
+; CHECK-LABEL: two_floats_same_op:
+; CHECK-NOT: f32x4.mul
+define hidden void @two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp21.not = icmp eq i32 %N, 0
+ br i1 %cmp21.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022
+ %1 = load float, ptr %arrayidx1, align 4
+ %mul = fmul float %0, %1
+ %arrayidx3 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.022
+ store float %mul, ptr %arrayidx3, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %mul8 = fmul float %2, %3
+ %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4
+ store float %mul8, ptr %y10, align 4
+ %inc = add nuw i32 %i.022, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_floats_vary_op:
+; CHECK-NOT: f32x4
+define hidden void @two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp20.not = icmp eq i32 %N, 0
+ br i1 %cmp20.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.021 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.021
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.021
+ %1 = load float, ptr %arrayidx1, align 4
+ %add = fadd float %0, %1
+ %arrayidx3 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.021
+ store float %add, ptr %arrayidx3, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %sub = fsub float %2, %3
+ %y9 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4
+ store float %sub, ptr %y9, align 4
+ %inc = add nuw i32 %i.021, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_bytes_two_floats_same_op:
+; CHECK: loop
+; CHECK: v128.load64_zero
+; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: v128.load64_zero
+; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.mul
+; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.mul
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: v128.store
+define hidden void @two_bytes_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp24.not = icmp eq i32 %N, 0
+ br i1 %cmp24.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.025 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoBytes, ptr %a, i32 %i.025
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = sitofp i8 %0 to float
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoBytes, ptr %b, i32 %i.025
+ %1 = load i8, ptr %arrayidx1, align 1
+ %conv3 = sitofp i8 %1 to float
+ %mul = fmul float %conv, %conv3
+ %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.025
+ store float %mul, ptr %arrayidx4, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1
+ %2 = load i8, ptr %y, align 1
+ %conv7 = sitofp i8 %2 to float
+ %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1
+ %3 = load i8, ptr %y9, align 1
+ %conv10 = sitofp i8 %3 to float
+ %mul11 = fmul float %conv7, %conv10
+ %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4
+ store float %mul11, ptr %y13, align 4
+ %inc = add nuw i32 %i.025, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_bytes_two_floats_vary_op:
+; CHECK: v128.load64_zero
+; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: v128.load64_zero
+; CHECK: i8x16.shuffle {{.*}} 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.add
+; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.sub
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: v128.store
+define hidden void @two_bytes_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp23.not = icmp eq i32 %N, 0
+ br i1 %cmp23.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.024 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoBytes, ptr %a, i32 %i.024
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = sitofp i8 %0 to float
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoBytes, ptr %b, i32 %i.024
+ %1 = load i8, ptr %arrayidx1, align 1
+ %conv3 = sitofp i8 %1 to float
+ %add = fadd float %conv, %conv3
+ %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.024
+ store float %add, ptr %arrayidx4, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1
+ %2 = load i8, ptr %y, align 1
+ %conv7 = sitofp i8 %2 to float
+ %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1
+ %3 = load i8, ptr %y9, align 1
+ %conv10 = sitofp i8 %3 to float
+ %sub = fsub float %conv7, %conv10
+ %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4
+ store float %sub, ptr %y12, align 4
+ %inc = add nuw i32 %i.024, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_floats_two_bytes_same_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: f32x4.mul
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.splat
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: f32x4.mul
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: v128.store64_lane
+define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp22.not = icmp eq i32 %N, 0
+ br i1 %cmp22.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.023
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.023
+ %1 = load float, ptr %arrayidx1, align 4
+ %mul = fmul float %0, %1
+ %conv = fptosi float %mul to i8
+ %arrayidx3 = getelementptr inbounds nuw %struct.TwoBytes, ptr %res, i32 %i.023
+ store i8 %conv, ptr %arrayidx3, align 1
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %mul8 = fmul float %2, %3
+ %conv9 = fptosi float %mul8 to i8
+ %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1
+ store i8 %conv9, ptr %y11, align 1
+ %inc = add nuw i32 %i.023, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_floats_two_bytes_vary_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: f32x4.add
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.splat
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: f32x4.sub
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: v128.store64_lane
+define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp21.not = icmp eq i32 %N, 0
+ br i1 %cmp21.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022
+ %1 = load float, ptr %arrayidx1, align 4
+ %add = fadd float %0, %1
+ %conv = fptosi float %add to i8
+ %arrayidx3 = getelementptr inbounds nuw %struct.TwoBytes, ptr %res, i32 %i.022
+ store i8 %conv, ptr %arrayidx3, align 1
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %sub = fsub float %2, %3
+ %conv8 = fptosi float %sub to i8
+ %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1
+ store i8 %conv8, ptr %y10, align 1
+ %inc = add nuw i32 %i.022, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_shorts_two_floats_same_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.mul
+; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.mul
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: v128.store
+define hidden void @two_shorts_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp24.not = icmp eq i32 %N, 0
+ br i1 %cmp24.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.025 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoShorts, ptr %a, i32 %i.025
+ %0 = load i16, ptr %arrayidx, align 2
+ %conv = sitofp i16 %0 to float
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoShorts, ptr %b, i32 %i.025
+ %1 = load i16, ptr %arrayidx1, align 2
+ %conv3 = sitofp i16 %1 to float
+ %mul = fmul float %conv, %conv3
+ %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.025
+ store float %mul, ptr %arrayidx4, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2
+ %2 = load i16, ptr %y, align 2
+ %conv7 = sitofp i16 %2 to float
+ %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2
+ %3 = load i16, ptr %y9, align 2
+ %conv10 = sitofp i16 %3 to float
+ %mul11 = fmul float %conv7, %conv10
+ %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4
+ store float %mul11, ptr %y13, align 4
+ %inc = add nuw i32 %i.025, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_shorts_two_floats_vary_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.add
+; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 2, 3, 6, 7, 10, 11, 14, 15, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.sub
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: v128.store
+define hidden void @two_shorts_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp23.not = icmp eq i32 %N, 0
+ br i1 %cmp23.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.024 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoShorts, ptr %a, i32 %i.024
+ %0 = load i16, ptr %arrayidx, align 2
+ %conv = sitofp i16 %0 to float
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoShorts, ptr %b, i32 %i.024
+ %1 = load i16, ptr %arrayidx1, align 2
+ %conv3 = sitofp i16 %1 to float
+ %add = fadd float %conv, %conv3
+ %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.024
+ store float %add, ptr %arrayidx4, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2
+ %2 = load i16, ptr %y, align 2
+ %conv7 = sitofp i16 %2 to float
+ %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2
+ %3 = load i16, ptr %y9, align 2
+ %conv10 = sitofp i16 %3 to float
+ %sub = fsub float %conv7, %conv10
+ %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4
+ store float %sub, ptr %y12, align 4
+ %inc = add nuw i32 %i.024, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_floats_two_shorts_same_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: f32x4.mul
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.splat
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: f32x4.mul
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: v128.store
+define hidden void @two_floats_two_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp22.not = icmp eq i32 %N, 0
+ br i1 %cmp22.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.023
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.023
+ %1 = load float, ptr %arrayidx1, align 4
+ %mul = fmul float %0, %1
+ %conv = fptosi float %mul to i16
+ %arrayidx3 = getelementptr inbounds nuw %struct.TwoShorts, ptr %res, i32 %i.023
+ store i16 %conv, ptr %arrayidx3, align 2
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %mul8 = fmul float %2, %3
+ %conv9 = fptosi float %mul8 to i16
+ %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2
+ store i16 %conv9, ptr %y11, align 2
+ %inc = add nuw i32 %i.023, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_floats_two_shorts_vary_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK: f32x4.add
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.splat
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK: f32x4.sub
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: v128.store
+define hidden void @two_floats_two_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp21.not = icmp eq i32 %N, 0
+ br i1 %cmp21.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022
+ %1 = load float, ptr %arrayidx1, align 4
+ %add = fadd float %0, %1
+ %conv = fptosi float %add to i16
+ %arrayidx3 = getelementptr inbounds nuw %struct.TwoShorts, ptr %res, i32 %i.022
+ store i16 %conv, ptr %arrayidx3, align 2
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %sub = fsub float %2, %3
+ %conv8 = fptosi float %sub to i16
+ %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2
+ store i16 %conv8, ptr %y10, align 2
+ %inc = add nuw i32 %i.022, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_floats_same_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: f32x4.mul
+; CHECK: v128.store
+define hidden void @four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp45.not = icmp eq i32 %N, 0
+ br i1 %cmp45.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046
+ %1 = load float, ptr %arrayidx1, align 4
+ %mul = fmul float %0, %1
+ %arrayidx3 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.046
+ store float %mul, ptr %arrayidx3, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %mul8 = fmul float %2, %3
+ %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4
+ store float %mul8, ptr %y10, align 4
+ %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8
+ %4 = load float, ptr %z, align 4
+ %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8
+ %5 = load float, ptr %z13, align 4
+ %mul14 = fmul float %4, %5
+ %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 8
+ store float %mul14, ptr %z16, align 4
+ %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12
+ %6 = load float, ptr %w, align 4
+ %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12
+ %7 = load float, ptr %w19, align 4
+ %mul20 = fmul float %6, %7
+ %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 12
+ store float %mul20, ptr %w22, align 4
+ %inc = add nuw i32 %i.046, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_floats_vary_op:
+; CHECK-NOT: f32x4
+define hidden void @four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp42.not = icmp eq i32 %N, 0
+ br i1 %cmp42.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.043 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.043
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.043
+ %1 = load float, ptr %arrayidx1, align 4
+ %add = fadd float %0, %1
+ %arrayidx3 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.043
+ store float %add, ptr %arrayidx3, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %sub = fsub float %2, %3
+ %y9 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4
+ store float %sub, ptr %y9, align 4
+ %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8
+ %4 = load float, ptr %z, align 4
+ %z12 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8
+ %5 = load float, ptr %z12, align 4
+ %mul = fmul float %4, %5
+ %z14 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 8
+ store float %mul, ptr %z14, align 4
+ %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12
+ %6 = load float, ptr %w, align 4
+ %w17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12
+ %7 = load float, ptr %w17, align 4
+ %div = fdiv float %6, %7
+ %w19 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 12
+ store float %div, ptr %w19, align 4
+ %inc = add nuw i32 %i.043, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_bytes_four_floats_same_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.mul
+; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.mul
+; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.mul
+; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.mul
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+define hidden void @four_bytes_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp52.not = icmp eq i32 %N, 0
+ br i1 %cmp52.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.053 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.FourBytes, ptr %a, i32 %i.053
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = sitofp i8 %0 to float
+ %arrayidx1 = getelementptr inbounds nuw %struct.FourBytes, ptr %b, i32 %i.053
+ %1 = load i8, ptr %arrayidx1, align 1
+ %conv3 = sitofp i8 %1 to float
+ %mul = fmul float %conv, %conv3
+ %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.053
+ store float %mul, ptr %arrayidx4, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1
+ %2 = load i8, ptr %y, align 1
+ %conv7 = sitofp i8 %2 to float
+ %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1
+ %3 = load i8, ptr %y9, align 1
+ %conv10 = sitofp i8 %3 to float
+ %mul11 = fmul float %conv7, %conv10
+ %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4
+ store float %mul11, ptr %y13, align 4
+ %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2
+ %4 = load i8, ptr %z, align 1
+ %conv15 = sitofp i8 %4 to float
+ %z17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2
+ %5 = load i8, ptr %z17, align 1
+ %conv18 = sitofp i8 %5 to float
+ %mul19 = fmul float %conv15, %conv18
+ %z21 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8
+ store float %mul19, ptr %z21, align 4
+ %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 3
+ %6 = load i8, ptr %w, align 1
+ %conv23 = sitofp i8 %6 to float
+ %w25 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 3
+ %7 = load i8, ptr %w25, align 1
+ %conv26 = sitofp i8 %7 to float
+ %mul27 = fmul float %conv23, %conv26
+ %w29 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12
+ store float %mul27, ptr %w29, align 4
+ %inc = add nuw i32 %i.053, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_bytes_four_floats_vary_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.mul
+; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 1, 5, 9, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.add
+; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 2, 6, 10, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.div
+; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 3, 7, 11, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extend_low_i8x16_s
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.sub
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+define hidden void @four_bytes_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp49.not = icmp eq i32 %N, 0
+ br i1 %cmp49.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.050 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.FourBytes, ptr %a, i32 %i.050
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = sitofp i8 %0 to float
+ %arrayidx1 = getelementptr inbounds nuw %struct.FourBytes, ptr %b, i32 %i.050
+ %1 = load i8, ptr %arrayidx1, align 1
+ %conv3 = sitofp i8 %1 to float
+ %mul = fmul float %conv, %conv3
+ %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.050
+ store float %mul, ptr %arrayidx4, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1
+ %2 = load i8, ptr %y, align 1
+ %conv7 = sitofp i8 %2 to float
+ %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1
+ %3 = load i8, ptr %y9, align 1
+ %conv10 = sitofp i8 %3 to float
+ %add = fadd float %conv7, %conv10
+ %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4
+ store float %add, ptr %y12, align 4
+ %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2
+ %4 = load i8, ptr %z, align 1
+ %conv14 = sitofp i8 %4 to float
+ %z16 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2
+ %5 = load i8, ptr %z16, align 1
+ %conv17 = sitofp i8 %5 to float
+ %div = fdiv float %conv14, %conv17
+ %z19 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8
+ store float %div, ptr %z19, align 4
+ %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 3
+ %6 = load i8, ptr %w, align 1
+ %conv21 = sitofp i8 %6 to float
+ %w23 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 3
+ %7 = load i8, ptr %w23, align 1
+ %conv24 = sitofp i8 %7 to float
+ %sub = fsub float %conv21, %conv24
+ %w26 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12
+ store float %sub, ptr %w26, align 4
+ %inc = add nuw i32 %i.050, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_floats_four_bytes_same_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.mul
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.splat
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.mul
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.mul
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.mul
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: v128.store
+define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp48.not = icmp eq i32 %N, 0
+ br i1 %cmp48.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.049 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.049
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.049
+ %1 = load float, ptr %arrayidx1, align 4
+ %mul = fmul float %0, %1
+ %conv = fptosi float %mul to i8
+ %arrayidx3 = getelementptr inbounds nuw %struct.FourBytes, ptr %res, i32 %i.049
+ store i8 %conv, ptr %arrayidx3, align 1
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %mul8 = fmul float %2, %3
+ %conv9 = fptosi float %mul8 to i8
+ %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1
+ store i8 %conv9, ptr %y11, align 1
+ %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8
+ %4 = load float, ptr %z, align 4
+ %z14 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8
+ %5 = load float, ptr %z14, align 4
+ %mul15 = fmul float %4, %5
+ %conv16 = fptosi float %mul15 to i8
+ %z18 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2
+ store i8 %conv16, ptr %z18, align 1
+ %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12
+ %6 = load float, ptr %w, align 4
+ %w21 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12
+ %7 = load float, ptr %w21, align 4
+ %mul22 = fmul float %6, %7
+ %conv23 = fptosi float %mul22 to i8
+ %w25 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 3
+ store i8 %conv23, ptr %w25, align 1
+ %inc = add nuw i32 %i.049, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_floats_four_bytes_vary_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.mul
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.splat
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.add
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.div
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.sub
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i8x16.replace_lane
+; CHECK: v128.store
+define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp45.not = icmp eq i32 %N, 0
+ br i1 %cmp45.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046
+ %1 = load float, ptr %arrayidx1, align 4
+ %mul = fmul float %0, %1
+ %conv = fptosi float %mul to i8
+ %arrayidx3 = getelementptr inbounds nuw %struct.FourBytes, ptr %res, i32 %i.046
+ store i8 %conv, ptr %arrayidx3, align 1
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %add = fadd float %2, %3
+ %conv8 = fptosi float %add to i8
+ %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1
+ store i8 %conv8, ptr %y10, align 1
+ %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8
+ %4 = load float, ptr %z, align 4
+ %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8
+ %5 = load float, ptr %z13, align 4
+ %div = fdiv float %4, %5
+ %conv14 = fptosi float %div to i8
+ %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2
+ store i8 %conv14, ptr %z16, align 1
+ %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12
+ %6 = load float, ptr %w, align 4
+ %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12
+ %7 = load float, ptr %w19, align 4
+ %sub = fsub float %6, %7
+ %conv20 = fptosi float %sub to i8
+ %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 3
+ store i8 %conv20, ptr %w22, align 1
+ %inc = add nuw i32 %i.046, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_shorts_four_floats_same_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.mul
+; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.mul
+; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.mul
+; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.mul
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+define hidden void @four_shorts_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp52.not = icmp eq i32 %N, 0
+ br i1 %cmp52.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.053 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.FourShorts, ptr %a, i32 %i.053
+ %0 = load i16, ptr %arrayidx, align 2
+ %conv = sitofp i16 %0 to float
+ %arrayidx1 = getelementptr inbounds nuw %struct.FourShorts, ptr %b, i32 %i.053
+ %1 = load i16, ptr %arrayidx1, align 2
+ %conv3 = sitofp i16 %1 to float
+ %mul = fmul float %conv, %conv3
+ %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.053
+ store float %mul, ptr %arrayidx4, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2
+ %2 = load i16, ptr %y, align 2
+ %conv7 = sitofp i16 %2 to float
+ %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2
+ %3 = load i16, ptr %y9, align 2
+ %conv10 = sitofp i16 %3 to float
+ %mul11 = fmul float %conv7, %conv10
+ %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4
+ store float %mul11, ptr %y13, align 4
+ %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %4 = load i16, ptr %z, align 2
+ %conv15 = sitofp i16 %4 to float
+ %z17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %5 = load i16, ptr %z17, align 2
+ %conv18 = sitofp i16 %5 to float
+ %mul19 = fmul float %conv15, %conv18
+ %z21 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8
+ store float %mul19, ptr %z21, align 4
+ %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 6
+ %6 = load i16, ptr %w, align 2
+ %conv23 = sitofp i16 %6 to float
+ %w25 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 6
+ %7 = load i16, ptr %w25, align 2
+ %conv26 = sitofp i16 %7 to float
+ %mul27 = fmul float %conv23, %conv26
+ %w29 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12
+ store float %mul27, ptr %w29, align 4
+ %inc = add nuw i32 %i.053, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_shorts_four_floats_vary_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 8, 9, 16, 17, 24, 25, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.mul
+; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 2, 3, 10, 11, 18, 19, 26, 27, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.add
+; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 12, 13, 20, 21, 28, 29, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.div
+; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: i8x16.shuffle {{.*}} 6, 7, 14, 15, 22, 23, 30, 31, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK: i32x4.extend_low_i16x8_s
+; CHECK: f32x4.convert_i32x4_s
+; CHECK: f32x4.sub
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.store
+define hidden void @four_shorts_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp49.not = icmp eq i32 %N, 0
+ br i1 %cmp49.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.050 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.FourShorts, ptr %a, i32 %i.050
+ %0 = load i16, ptr %arrayidx, align 2
+ %conv = sitofp i16 %0 to float
+ %arrayidx1 = getelementptr inbounds nuw %struct.FourShorts, ptr %b, i32 %i.050
+ %1 = load i16, ptr %arrayidx1, align 2
+ %conv3 = sitofp i16 %1 to float
+ %mul = fmul float %conv, %conv3
+ %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.050
+ store float %mul, ptr %arrayidx4, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2
+ %2 = load i16, ptr %y, align 2
+ %conv7 = sitofp i16 %2 to float
+ %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2
+ %3 = load i16, ptr %y9, align 2
+ %conv10 = sitofp i16 %3 to float
+ %add = fadd float %conv7, %conv10
+ %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4
+ store float %add, ptr %y12, align 4
+ %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %4 = load i16, ptr %z, align 2
+ %conv14 = sitofp i16 %4 to float
+ %z16 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %5 = load i16, ptr %z16, align 2
+ %conv17 = sitofp i16 %5 to float
+ %div = fdiv float %conv14, %conv17
+ %z19 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8
+ store float %div, ptr %z19, align 4
+ %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 6
+ %6 = load i16, ptr %w, align 2
+ %conv21 = sitofp i16 %6 to float
+ %w23 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 6
+ %7 = load i16, ptr %w23, align 2
+ %conv24 = sitofp i16 %7 to float
+ %sub = fsub float %conv21, %conv24
+ %w26 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12
+ store float %sub, ptr %w26, align 4
+ %inc = add nuw i32 %i.050, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_floats_four_shorts_same_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.mul
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.splat
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.mul
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.mul
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.mul
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: v128.store
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.splat
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: v128.store
+define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp48.not = icmp eq i32 %N, 0
+ br i1 %cmp48.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.049 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.049
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.049
+ %1 = load float, ptr %arrayidx1, align 4
+ %mul = fmul float %0, %1
+ %conv = fptosi float %mul to i16
+ %arrayidx3 = getelementptr inbounds nuw %struct.FourShorts, ptr %res, i32 %i.049
+ store i16 %conv, ptr %arrayidx3, align 2
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %mul8 = fmul float %2, %3
+ %conv9 = fptosi float %mul8 to i16
+ %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2
+ store i16 %conv9, ptr %y11, align 2
+ %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8
+ %4 = load float, ptr %z, align 4
+ %z14 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8
+ %5 = load float, ptr %z14, align 4
+ %mul15 = fmul float %4, %5
+ %conv16 = fptosi float %mul15 to i16
+ %z18 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4
+ store i16 %conv16, ptr %z18, align 2
+ %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12
+ %6 = load float, ptr %w, align 4
+ %w21 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12
+ %7 = load float, ptr %w21, align 4
+ %mul22 = fmul float %6, %7
+ %conv23 = fptosi float %mul22 to i16
+ %w25 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 6
+ store i16 %conv23, ptr %w25, align 2
+ %inc = add nuw i32 %i.049, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_floats_four_shorts_vary_op:
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.mul
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.splat
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.add
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.div
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
+; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
+; CHECK: f32x4.sub
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: v128.store
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.splat
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: f32x4.extract_lane
+; CHECK: i32.trunc_sat_f32_s
+; CHECK: i16x8.replace_lane
+; CHECK: v128.store
+define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp45.not = icmp eq i32 %N, 0
+ br i1 %cmp45.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046
+ %1 = load float, ptr %arrayidx1, align 4
+ %mul = fmul float %0, %1
+ %conv = fptosi float %mul to i16
+ %arrayidx3 = getelementptr inbounds nuw %struct.FourShorts, ptr %res, i32 %i.046
+ store i16 %conv, ptr %arrayidx3, align 2
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %add = fadd float %2, %3
+ %conv8 = fptosi float %add to i16
+ %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2
+ store i16 %conv8, ptr %y10, align 2
+ %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8
+ %4 = load float, ptr %z, align 4
+ %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8
+ %5 = load float, ptr %z13, align 4
+ %div = fdiv float %4, %5
+ %conv14 = fptosi float %div to i16
+ %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4
+ store i16 %conv14, ptr %z16, align 2
+ %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12
+ %6 = load float, ptr %w, align 4
+ %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12
+ %7 = load float, ptr %w19, align 4
+ %sub = fsub float %6, %7
+ %conv20 = fptosi float %sub to i16
+ %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 6
+ store i16 %conv20, ptr %w22, align 2
+ %inc = add nuw i32 %i.046, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmax.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmax.ll
new file mode 100644
index 0000000..45f4ddd
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmax.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+
+; RUN: llc < %s -mtriple=wasm32-unknown-unknown -mattr=+simd128,+relaxed-simd | FileCheck %s
+
+; Test that fmaxnum and fmaximumnum get transformed to relaxed_max
+
+target triple = "wasm32"
+
+define <4 x float> @test_maxnum_f32x4(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test_maxnum_f32x4:
+; CHECK: .functype test_maxnum_f32x4 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %result = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b)
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_maximumnum_f32x4(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test_maximumnum_f32x4:
+; CHECK: .functype test_maximumnum_f32x4 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %result = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %a, <4 x float> %b)
+ ret <4 x float> %result
+}
+
+define <2 x double> @test_maxnum_f64x2(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: test_maxnum_f64x2:
+; CHECK: .functype test_maxnum_f64x2 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %result = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b)
+ ret <2 x double> %result
+}
+
+define <2 x double> @test_minimumnum_f64x2(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: test_minimumnum_f64x2:
+; CHECK: .functype test_minimumnum_f64x2 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_max
+; CHECK-NEXT: # fallthrough-return
+ %result = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> %a, <2 x double> %b)
+ ret <2 x double> %result
+}
+
+declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.maximumnum.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>)
+declare <2 x double> @llvm.maximumnum.v2f64(<2 x double>, <2 x double>)
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmin.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmin.ll
new file mode 100644
index 0000000..f3eec02
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fmin.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=wasm32-unknown-unknown -mattr=+simd128,+relaxed-simd | FileCheck %s
+
+; Test that fminnum and fminimumnum get transformed to relaxed_min
+
+target triple = "wasm32"
+
+define <4 x float> @test_minnum_f32x4(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test_minnum_f32x4:
+; CHECK: .functype test_minnum_f32x4 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %result = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b)
+ ret <4 x float> %result
+}
+
+define <4 x float> @test_minimumnum_f32x4(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test_minimumnum_f32x4:
+; CHECK: .functype test_minimumnum_f32x4 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f32x4.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %result = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> %a, <4 x float> %b)
+ ret <4 x float> %result
+}
+
+define <2 x double> @test_minnum_f64x2(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: test_minnum_f64x2:
+; CHECK: .functype test_minnum_f64x2 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %result = call <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b)
+ ret <2 x double> %result
+}
+
+define <2 x double> @test_minimumnum_f64x2(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: test_minimumnum_f64x2:
+; CHECK: .functype test_minimumnum_f64x2 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: f64x2.relaxed_min
+; CHECK-NEXT: # fallthrough-return
+ %result = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %a, <2 x double> %b)
+ ret <2 x double> %result
+}
+
+declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.fminimumnum.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>)
+declare <2 x double> @llvm.fminimumnum.v2f64(<2 x double>, <2 x double>)
diff --git a/llvm/test/CodeGen/WebAssembly/simd-vector-trunc.ll b/llvm/test/CodeGen/WebAssembly/simd-vector-trunc.ll
index 123438d..f58456b 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-vector-trunc.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-vector-trunc.ll
@@ -94,6 +94,19 @@ entry:
ret <16 x i8> %0
}
+define <8 x i8> @trunc8i16_8i8(<8 x i16> %a) {
+; CHECK-LABEL: trunc8i16_8i8:
+; CHECK: .functype trunc8i16_8i8 (v128) -> (v128)
+; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK-NEXT: # fallthrough-return
+entry:
+ %0 = trunc <8 x i16> %a to <8 x i8>
+ ret <8 x i8> %0
+}
+
define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
; CHECK-LABEL: trunc8i64_8i16:
; CHECK: .functype trunc8i64_8i16 (v128, v128, v128, v128) -> (v128)
@@ -139,3 +152,29 @@ entry:
%0 = trunc <8 x i32> %a to <8 x i16>
ret <8 x i16> %0
}
+
+define <4 x i16> @trunc4i32_4i16(<4 x i32> %a) {
+; CHECK-LABEL: trunc4i32_4i16:
+; CHECK: .functype trunc4i32_4i16 (v128) -> (v128)
+; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.shuffle 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK-NEXT: # fallthrough-return
+entry:
+ %0 = trunc <4 x i32> %a to <4 x i16>
+ ret <4 x i16> %0
+}
+
+define <4 x i8> @trunc4i32_4i8(<4 x i32> %a) {
+; CHECK-LABEL: trunc4i32_4i8:
+; CHECK: .functype trunc4i32_4i8 (v128) -> (v128)
+; CHECK-NEXT: # %bb.0: # %entry
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.shuffle 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK-NEXT: # fallthrough-return
+entry:
+ %0 = trunc <4 x i32> %a to <4 x i8>
+ ret <4 x i8> %0
+}
diff --git a/llvm/test/CodeGen/X86/bf16-fast-isel.ll b/llvm/test/CodeGen/X86/bf16-fast-isel.ll
new file mode 100644
index 0000000..c659e0e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/bf16-fast-isel.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc --fast-isel < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+define i8 @test_direct_call(ptr %f) nounwind {
+; CHECK-LABEL: test_direct_call:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo@PLT
+; CHECK-NEXT: callq bar@PLT
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: retq
+entry:
+ %call = call bfloat @foo(ptr %f)
+ %call2 = call zeroext i8 @bar(bfloat %call)
+ ret i8 %call2
+}
+
+define i8 @test_fast_direct_call(ptr %f) nounwind {
+; CHECK-LABEL: test_fast_direct_call:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo_fast@PLT
+; CHECK-NEXT: callq bar@PLT
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: retq
+entry:
+ %call = call fastcc bfloat @foo_fast(ptr %f)
+ %call2 = call zeroext i8 @bar(bfloat %call)
+ ret i8 %call2
+}
+
+define i8 @test_indirect_all(ptr %fptr, ptr %f) nounwind {
+; CHECK-LABEL: test_indirect_all:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: movq %rsi, %rdi
+; CHECK-NEXT: callq foo@PLT
+; CHECK-NEXT: callq *%rbx
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: retq
+entry:
+ %call = call bfloat @foo(ptr %f)
+ %call2 = call zeroext i8 %fptr(bfloat %call)
+ ret i8 %call2
+}
+
+define i8 @test_fast_indirect_all(ptr %fptr, ptr %f) nounwind {
+; CHECK-LABEL: test_fast_indirect_all:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: movq %rsi, %rdi
+; CHECK-NEXT: callq foo@PLT
+; CHECK-NEXT: callq *%rbx
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: retq
+entry:
+ %call = call fastcc bfloat @foo(ptr %f)
+ %call2 = call zeroext i8 %fptr(bfloat %call)
+ ret i8 %call2
+}
+
+declare bfloat @foo(ptr %f)
+declare zeroext i8 @bar(bfloat)
+declare fastcc bfloat @foo_fast(ptr %f)
diff --git a/llvm/test/CodeGen/X86/fp128-select.ll b/llvm/test/CodeGen/X86/fp128-select.ll
index 659e4dd..27a651e 100644
--- a/llvm/test/CodeGen/X86/fp128-select.ll
+++ b/llvm/test/CodeGen/X86/fp128-select.ll
@@ -13,8 +13,8 @@ define void @test_select(ptr %p, ptr %q, i1 zeroext %c) nounwind {
; SSE: # %bb.0:
; SSE-NEXT: testl %edx, %edx
; SSE-NEXT: jne .LBB0_1
-; SSE-NEXT: # %bb.3:
-; SSE-NEXT: movaps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: # %bb.2:
+; SSE-NEXT: movaps {{.*#+}} xmm0 = [NaN]
; SSE-NEXT: movaps %xmm0, (%rsi)
; SSE-NEXT: retq
; SSE-NEXT: .LBB0_1:
@@ -58,7 +58,7 @@ define fp128 @test_select_cc(fp128, fp128) nounwind {
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: jmp .LBB1_3
; SSE-NEXT: .LBB1_1:
-; SSE-NEXT: movaps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0]
; SSE-NEXT: .LBB1_3: # %BB0
; SSE-NEXT: testl %ebx, %ebx
; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
diff --git a/llvm/test/DebugInfo/X86/aggressive-instcombine-store-merge-dbg.ll b/llvm/test/DebugInfo/X86/aggressive-instcombine-store-merge-dbg.ll
new file mode 100644
index 0000000..f6e941a
--- /dev/null
+++ b/llvm/test/DebugInfo/X86/aggressive-instcombine-store-merge-dbg.ll
@@ -0,0 +1,49 @@
+; RUN: opt -S -passes=aggressive-instcombine -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+;; Aggressive instcombine merges the two i8 stores into an i16 store. Check
+;; the debug location and DIAssignID metadata get merged.
+
+; CHECK: define void @test_i16(i16 %x, ptr %p) !dbg ![[#]] {
+; CHECK-NEXT: store i16 %x, ptr %p, align 1, !dbg ![[DBG:[0-9]+]], !DIAssignID ![[ID:[0-9]+]]
+; CHECK-NEXT: #dbg_assign(i16 %x, ![[#]],
+; CHECK-SAME: !DIExpression(DW_OP_LLVM_convert, 16, DW_ATE_unsigned, DW_OP_LLVM_convert, 8, DW_ATE_unsigned, DW_OP_stack_value, DW_OP_LLVM_fragment, 0, 8),
+; CHECK-SAME: ![[ID]], ptr %p, !DIExpression(), ![[#]])
+; CHECK-NEXT: #dbg_assign(i16 %x, ![[#]],
+; CHECK-SAME: !DIExpression(DW_OP_constu, 8, DW_OP_shr, DW_OP_LLVM_convert, 16, DW_ATE_unsigned, DW_OP_LLVM_convert, 8, DW_ATE_unsigned, DW_OP_stack_value, DW_OP_LLVM_fragment, 8, 8),
+; CHECK-SAME: ![[ID]], ptr %p, !DIExpression(DW_OP_plus_uconst, 1), ![[#]])
+; CHECK-NEXT: ret void
+
+; CHECK: ![[DBG]] = !DILocation(line: 0, scope: ![[#]])
+
+define void @test_i16(i16 %x, ptr %p) !dbg !5 {
+ %x.0 = trunc i16 %x to i8
+ store i8 %x.0, ptr %p, align 1, !dbg !16, !DIAssignID !17
+ #dbg_assign(i8 %x.0, !9, !DIExpression(DW_OP_LLVM_fragment, 0, 8), !17, ptr %p, !DIExpression(), !18)
+ %shr.1 = lshr i16 %x, 8
+ %x.1 = trunc i16 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1, align 1, !dbg !19, !DIAssignID !20
+ #dbg_assign(i8 %x.1, !9, !DIExpression(DW_OP_LLVM_fragment, 8, 8), !20, ptr %gep.1, !DIExpression(), !18)
+ ret void
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.debugify = !{!2, !3}
+!llvm.module.flags = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "/app/example.ll", directory: "/")
+!2 = !{i32 7}
+!3 = !{i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "test_i16", linkageName: "test_i16", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8)
+!6 = !DISubroutineType(types: !7)
+!7 = !{}
+!8 = !{!9}
+!9 = !DILocalVariable(name: "1", scope: !5, file: !1, line: 1, type: !10)
+!10 = !DIBasicType(name: "ty16", size: 16, encoding: DW_ATE_unsigned)
+!16 = !DILocation(line: 2, column: 1, scope: !5)
+!17 = distinct !DIAssignID()
+!18 = !DILocation(line: 1, column: 1, scope: !5)
+!19 = !DILocation(line: 6, column: 1, scope: !5)
+!20 = distinct !DIAssignID()
diff --git a/llvm/test/Instrumentation/AllocToken/basic.ll b/llvm/test/Instrumentation/AllocToken/basic.ll
index 099d37d..0c34b137 100644
--- a/llvm/test/Instrumentation/AllocToken/basic.ll
+++ b/llvm/test/Instrumentation/AllocToken/basic.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=increment -S | FileCheck %s
+; RUN: opt < %s -passes='inferattrs,alloc-token<mode=increment>' -S | FileCheck %s
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/test/Instrumentation/AllocToken/basic32.ll b/llvm/test/Instrumentation/AllocToken/basic32.ll
index 944a452..52d1d14 100644
--- a/llvm/test/Instrumentation/AllocToken/basic32.ll
+++ b/llvm/test/Instrumentation/AllocToken/basic32.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=increment -S | FileCheck %s
+; RUN: opt < %s -passes='inferattrs,alloc-token<mode=increment>' -S | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
diff --git a/llvm/test/Instrumentation/AllocToken/fast.ll b/llvm/test/Instrumentation/AllocToken/fast.ll
index 19a3ef6..f6bf5ee 100644
--- a/llvm/test/Instrumentation/AllocToken/fast.ll
+++ b/llvm/test/Instrumentation/AllocToken/fast.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=increment -alloc-token-fast-abi -alloc-token-max=3 -S | FileCheck %s
+; RUN: opt < %s -passes='inferattrs,alloc-token<mode=increment>' -alloc-token-fast-abi -alloc-token-max=3 -S | FileCheck %s
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/test/Instrumentation/AllocToken/intrinsic.ll b/llvm/test/Instrumentation/AllocToken/intrinsic.ll
index 13aaa90..5c6f2f1 100644
--- a/llvm/test/Instrumentation/AllocToken/intrinsic.ll
+++ b/llvm/test/Instrumentation/AllocToken/intrinsic.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; Test that the alloc-token pass lowers the intrinsic to a constant token ID.
;
-; RUN: opt < %s -passes=alloc-token -alloc-token-mode=typehashpointersplit -alloc-token-max=2 -S | FileCheck %s
+; RUN: opt < %s -passes='alloc-token<mode=typehashpointersplit>' -alloc-token-max=2 -S | FileCheck %s
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/Instrumentation/AllocToken/intrinsic32.ll b/llvm/test/Instrumentation/AllocToken/intrinsic32.ll
index eb5dbbe..15f7c25 100644
--- a/llvm/test/Instrumentation/AllocToken/intrinsic32.ll
+++ b/llvm/test/Instrumentation/AllocToken/intrinsic32.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; Test that the alloc-token pass lowers the intrinsic to a constant token ID.
;
-; RUN: opt < %s -passes=alloc-token -alloc-token-mode=typehashpointersplit -alloc-token-max=2 -S | FileCheck %s
+; RUN: opt < %s -passes='alloc-token<mode=typehashpointersplit>' -alloc-token-max=2 -S | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
target triple = "i386-pc-linux-gnu"
diff --git a/llvm/test/Instrumentation/AllocToken/invoke.ll b/llvm/test/Instrumentation/AllocToken/invoke.ll
index 347c99a..8e7ab38 100644
--- a/llvm/test/Instrumentation/AllocToken/invoke.ll
+++ b/llvm/test/Instrumentation/AllocToken/invoke.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=increment -S | FileCheck %s
+; RUN: opt < %s -passes='inferattrs,alloc-token<mode=increment>' -S | FileCheck %s
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/test/Instrumentation/AllocToken/nonlibcalls.ll b/llvm/test/Instrumentation/AllocToken/nonlibcalls.ll
index 19673da..45f573e 100644
--- a/llvm/test/Instrumentation/AllocToken/nonlibcalls.ll
+++ b/llvm/test/Instrumentation/AllocToken/nonlibcalls.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=increment -alloc-token-extended -S | FileCheck %s
+; RUN: opt < %s -passes='inferattrs,alloc-token<mode=increment>' -alloc-token-extended -S | FileCheck %s
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/test/Instrumentation/AllocToken/typehashpointersplit.ll b/llvm/test/Instrumentation/AllocToken/typehashpointersplit.ll
index 1f77648..4d1be5e 100644
--- a/llvm/test/Instrumentation/AllocToken/typehashpointersplit.ll
+++ b/llvm/test/Instrumentation/AllocToken/typehashpointersplit.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -passes=inferattrs,alloc-token -alloc-token-mode=typehashpointersplit -alloc-token-max=2 -S | FileCheck %s
+; RUN: opt < %s -passes='inferattrs,alloc-token<mode=typehashpointersplit>' -alloc-token-max=2 -S | FileCheck %s
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/llvm/test/LTO/AArch64/TestInputs/bar.ll b/llvm/test/LTO/AArch64/Inputs/bar.ll
index 7c2a753..7c2a753 100644
--- a/llvm/test/LTO/AArch64/TestInputs/bar.ll
+++ b/llvm/test/LTO/AArch64/Inputs/bar.ll
diff --git a/llvm/test/LTO/AArch64/TestInputs/fiz.ll b/llvm/test/LTO/AArch64/Inputs/fiz.ll
index e578426..e578426 100644
--- a/llvm/test/LTO/AArch64/TestInputs/fiz.ll
+++ b/llvm/test/LTO/AArch64/Inputs/fiz.ll
diff --git a/llvm/test/LTO/AArch64/TestInputs/foo.ll b/llvm/test/LTO/AArch64/Inputs/foo.ll
index 689d938..689d938 100644
--- a/llvm/test/LTO/AArch64/TestInputs/foo.ll
+++ b/llvm/test/LTO/AArch64/Inputs/foo.ll
diff --git a/llvm/test/LTO/AArch64/TestInputs/old.ll b/llvm/test/LTO/AArch64/Inputs/old.ll
index 2b1758b..2b1758b 100644
--- a/llvm/test/LTO/AArch64/TestInputs/old.ll
+++ b/llvm/test/LTO/AArch64/Inputs/old.ll
diff --git a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll
index aef8907..20254de 100644
--- a/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll
+++ b/llvm/test/LTO/AArch64/link-branch-target-enforcement.ll
@@ -2,7 +2,7 @@
;; be mixed.
;;
; RUN: llvm-as %s -o %t1.bc
-; RUN: llvm-as %p/TestInputs/foo.ll -o %t2.bc
+; RUN: llvm-as %p/Inputs/foo.ll -o %t2.bc
; RUN: llvm-lto -exported-symbol main \
; RUN: -exported-symbol foo_on \
; RUN: -filetype=obj \
diff --git a/llvm/test/LTO/AArch64/link-sign-return-address.ll b/llvm/test/LTO/AArch64/link-sign-return-address.ll
index df6276f..331e481 100644
--- a/llvm/test/LTO/AArch64/link-sign-return-address.ll
+++ b/llvm/test/LTO/AArch64/link-sign-return-address.ll
@@ -2,10 +2,10 @@
;; be mixed.
;
; RUN: llvm-as %s -o %t1.bc
-; RUN: llvm-as %p/TestInputs/foo.ll -o %t2.bc
-; RUN: llvm-as %p/TestInputs/fiz.ll -o %t3.bc
-; RUN: llvm-as %p/TestInputs/bar.ll -o %t4.bc
-; RUN: llvm-as %p/TestInputs/old.ll -o %t5.bc
+; RUN: llvm-as %p/Inputs/foo.ll -o %t2.bc
+; RUN: llvm-as %p/Inputs/fiz.ll -o %t3.bc
+; RUN: llvm-as %p/Inputs/bar.ll -o %t4.bc
+; RUN: llvm-as %p/Inputs/old.ll -o %t5.bc
; RUN: llvm-lto -exported-symbol main \
; RUN: -exported-symbol foo_on \
; RUN: -exported-symbol foo_off \
diff --git a/llvm/test/MC/AMDGPU/literals.s b/llvm/test/MC/AMDGPU/literals.s
index 78aa8f2..3faea99 100644
--- a/llvm/test/MC/AMDGPU/literals.s
+++ b/llvm/test/MC/AMDGPU/literals.s
@@ -20,282 +20,282 @@
//---------------------------------------------------------------------------//
v_fract_f64 v[0:1], 0.5
-// SICI: v_fract_f64_e32 v[0:1], 0.5 ; encoding: [0xf0,0x7c,0x00,0x7e]
-// GFX89: v_fract_f64_e32 v[0:1], 0.5 ; encoding: [0xf0,0x64,0x00,0x7e]
-// GFX12XX: v_fract_f64_e32 v[0:1], 0.5 ; encoding: [0xf0,0x7c,0x00,0x7e]
// GFX11: v_fract_f64_e32 v[0:1], 0.5 ; encoding: [0xf0,0x7c,0x00,0x7e]
+// GFX12XX: v_fract_f64_e32 v[0:1], 0.5 ; encoding: [0xf0,0x7c,0x00,0x7e]
+// GFX89: v_fract_f64_e32 v[0:1], 0.5 ; encoding: [0xf0,0x64,0x00,0x7e]
+// SICI: v_fract_f64_e32 v[0:1], 0.5 ; encoding: [0xf0,0x7c,0x00,0x7e]
v_sqrt_f64 v[0:1], -4.0
-// SICI: v_sqrt_f64_e32 v[0:1], -4.0 ; encoding: [0xf7,0x68,0x00,0x7e]
-// GFX89: v_sqrt_f64_e32 v[0:1], -4.0 ; encoding: [0xf7,0x50,0x00,0x7e]
-// GFX12XX: v_sqrt_f64_e32 v[0:1], -4.0 ; encoding: [0xf7,0x68,0x00,0x7e]
// GFX11: v_sqrt_f64_e32 v[0:1], -4.0 ; encoding: [0xf7,0x68,0x00,0x7e]
+// GFX12XX: v_sqrt_f64_e32 v[0:1], -4.0 ; encoding: [0xf7,0x68,0x00,0x7e]
+// GFX89: v_sqrt_f64_e32 v[0:1], -4.0 ; encoding: [0xf7,0x50,0x00,0x7e]
+// SICI: v_sqrt_f64_e32 v[0:1], -4.0 ; encoding: [0xf7,0x68,0x00,0x7e]
v_log_clamp_f32 v1, 0.5
// NOGFX8PLUS: :[[@LINE-1]]:1: error: instruction not supported on this GPU
// SICI: v_log_clamp_f32_e32 v1, 0.5 ; encoding: [0xf0,0x4c,0x02,0x7e]
v_trunc_f32 v0, 0.5
-// SICI: v_trunc_f32_e32 v0, 0.5 ; encoding: [0xf0,0x42,0x00,0x7e]
-// GFX89: v_trunc_f32_e32 v0, 0.5 ; encoding: [0xf0,0x38,0x00,0x7e]
-// GFX12XX: v_trunc_f32_e32 v0, 0.5 ; encoding: [0xf0,0x42,0x00,0x7e]
// GFX11: v_trunc_f32_e32 v0, 0.5 ; encoding: [0xf0,0x42,0x00,0x7e]
+// GFX12XX: v_trunc_f32_e32 v0, 0.5 ; encoding: [0xf0,0x42,0x00,0x7e]
+// GFX89: v_trunc_f32_e32 v0, 0.5 ; encoding: [0xf0,0x38,0x00,0x7e]
+// SICI: v_trunc_f32_e32 v0, 0.5 ; encoding: [0xf0,0x42,0x00,0x7e]
v_fract_f64 v[0:1], -1.0
-// SICI: v_fract_f64_e32 v[0:1], -1.0 ; encoding: [0xf3,0x7c,0x00,0x7e]
-// GFX89: v_fract_f64_e32 v[0:1], -1.0 ; encoding: [0xf3,0x64,0x00,0x7e]
-// GFX12XX: v_fract_f64_e32 v[0:1], -1.0 ; encoding: [0xf3,0x7c,0x00,0x7e]
// GFX11: v_fract_f64_e32 v[0:1], -1.0 ; encoding: [0xf3,0x7c,0x00,0x7e]
+// GFX12XX: v_fract_f64_e32 v[0:1], -1.0 ; encoding: [0xf3,0x7c,0x00,0x7e]
+// GFX89: v_fract_f64_e32 v[0:1], -1.0 ; encoding: [0xf3,0x64,0x00,0x7e]
+// SICI: v_fract_f64_e32 v[0:1], -1.0 ; encoding: [0xf3,0x7c,0x00,0x7e]
v_trunc_f32 v0, -1.0
-// SICI: v_trunc_f32_e32 v0, -1.0 ; encoding: [0xf3,0x42,0x00,0x7e]
-// GFX89: v_trunc_f32_e32 v0, -1.0 ; encoding: [0xf3,0x38,0x00,0x7e]
-// GFX12XX: v_trunc_f32_e32 v0, -1.0 ; encoding: [0xf3,0x42,0x00,0x7e]
// GFX11: v_trunc_f32_e32 v0, -1.0 ; encoding: [0xf3,0x42,0x00,0x7e]
+// GFX12XX: v_trunc_f32_e32 v0, -1.0 ; encoding: [0xf3,0x42,0x00,0x7e]
+// GFX89: v_trunc_f32_e32 v0, -1.0 ; encoding: [0xf3,0x38,0x00,0x7e]
+// SICI: v_trunc_f32_e32 v0, -1.0 ; encoding: [0xf3,0x42,0x00,0x7e]
v_fract_f64 v[0:1], 4.0
-// SICI: v_fract_f64_e32 v[0:1], 4.0 ; encoding: [0xf6,0x7c,0x00,0x7e]
-// GFX89: v_fract_f64_e32 v[0:1], 4.0 ; encoding: [0xf6,0x64,0x00,0x7e]
-// GFX12XX: v_fract_f64_e32 v[0:1], 4.0 ; encoding: [0xf6,0x7c,0x00,0x7e]
// GFX11: v_fract_f64_e32 v[0:1], 4.0 ; encoding: [0xf6,0x7c,0x00,0x7e]
+// GFX12XX: v_fract_f64_e32 v[0:1], 4.0 ; encoding: [0xf6,0x7c,0x00,0x7e]
+// GFX89: v_fract_f64_e32 v[0:1], 4.0 ; encoding: [0xf6,0x64,0x00,0x7e]
+// SICI: v_fract_f64_e32 v[0:1], 4.0 ; encoding: [0xf6,0x7c,0x00,0x7e]
v_trunc_f32 v0, 4.0
-// SICI: v_trunc_f32_e32 v0, 4.0 ; encoding: [0xf6,0x42,0x00,0x7e]
-// GFX89: v_trunc_f32_e32 v0, 4.0 ; encoding: [0xf6,0x38,0x00,0x7e]
-// GFX12XX: v_trunc_f32_e32 v0, 4.0 ; encoding: [0xf6,0x42,0x00,0x7e]
// GFX11: v_trunc_f32_e32 v0, 4.0 ; encoding: [0xf6,0x42,0x00,0x7e]
+// GFX12XX: v_trunc_f32_e32 v0, 4.0 ; encoding: [0xf6,0x42,0x00,0x7e]
+// GFX89: v_trunc_f32_e32 v0, 4.0 ; encoding: [0xf6,0x38,0x00,0x7e]
+// SICI: v_trunc_f32_e32 v0, 4.0 ; encoding: [0xf6,0x42,0x00,0x7e]
v_fract_f64 v[0:1], 0.0
-// SICI: v_fract_f64_e32 v[0:1], 0 ; encoding: [0x80,0x7c,0x00,0x7e]
-// GFX89: v_fract_f64_e32 v[0:1], 0 ; encoding: [0x80,0x64,0x00,0x7e]
-// GFX12XX: v_fract_f64_e32 v[0:1], 0 ; encoding: [0x80,0x7c,0x00,0x7e]
// GFX11: v_fract_f64_e32 v[0:1], 0 ; encoding: [0x80,0x7c,0x00,0x7e]
+// GFX12XX: v_fract_f64_e32 v[0:1], 0 ; encoding: [0x80,0x7c,0x00,0x7e]
+// GFX89: v_fract_f64_e32 v[0:1], 0 ; encoding: [0x80,0x64,0x00,0x7e]
+// SICI: v_fract_f64_e32 v[0:1], 0 ; encoding: [0x80,0x7c,0x00,0x7e]
v_trunc_f32 v0, 0.0
-// SICI: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e]
-// GFX89: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x38,0x00,0x7e]
-// GFX12XX: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e]
// GFX11: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e]
+// GFX12XX: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e]
+// GFX89: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x38,0x00,0x7e]
+// SICI: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e]
v_fract_f64 v[0:1], 1.5
-// SICI: v_fract_f64_e32 v[0:1], 0x3ff80000 ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf8,0x3f]
-// GFX89: v_fract_f64_e32 v[0:1], 0x3ff80000 ; encoding: [0xff,0x64,0x00,0x7e,0x00,0x00,0xf8,0x3f]
-// GFX12XX: v_fract_f64_e32 v[0:1], 0x3ff80000 ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf8,0x3f]
// GFX11: v_fract_f64_e32 v[0:1], 0x3ff80000 ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf8,0x3f]
+// GFX12XX: v_fract_f64_e32 v[0:1], 0x3ff80000 ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf8,0x3f]
+// GFX89: v_fract_f64_e32 v[0:1], 0x3ff80000 ; encoding: [0xff,0x64,0x00,0x7e,0x00,0x00,0xf8,0x3f]
+// SICI: v_fract_f64_e32 v[0:1], 0x3ff80000 ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf8,0x3f]
v_trunc_f32 v0, 1.5
-// SICI: v_trunc_f32_e32 v0, 0x3fc00000 ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0xc0,0x3f]
-// GFX89: v_trunc_f32_e32 v0, 0x3fc00000 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x00,0xc0,0x3f]
-// GFX12XX: v_trunc_f32_e32 v0, 0x3fc00000 ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0xc0,0x3f]
// GFX11: v_trunc_f32_e32 v0, 0x3fc00000 ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0xc0,0x3f]
+// GFX12XX: v_trunc_f32_e32 v0, 0x3fc00000 ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0xc0,0x3f]
+// GFX89: v_trunc_f32_e32 v0, 0x3fc00000 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x00,0xc0,0x3f]
+// SICI: v_trunc_f32_e32 v0, 0x3fc00000 ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0xc0,0x3f]
v_fract_f64 v[0:1], -3.1415
-// SICI: v_fract_f64_e32 v[0:1], 0xc00921ca ; encoding: [0xff,0x7c,0x00,0x7e,0xca,0x21,0x09,0xc0]
-// GFX89: v_fract_f64_e32 v[0:1], 0xc00921ca ; encoding: [0xff,0x64,0x00,0x7e,0xca,0x21,0x09,0xc0]
-// NOSICI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
-// NOGFX89: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
// GFX11: v_fract_f64_e32 v[0:1], 0xc00921ca ; encoding: [0xff,0x7c,0x00,0x7e,0xca,0x21,0x09,0xc0]
// GFX12: v_fract_f64_e32 v[0:1], 0xc00921ca ; encoding: [0xff,0x7c,0x00,0x7e,0xca,0x21,0x09,0xc0]
// GFX1250: v_fract_f64_e32 v[0:1], 0xc00921cac083126f ; encoding: [0xfe,0x7c,0x00,0x7e,0x6f,0x12,0x83,0xc0,0xca,0x21,0x09,0xc0]
-// NOGFX11: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
-// NOGFX12: :[[@LINE-9]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// GFX89: v_fract_f64_e32 v[0:1], 0xc00921ca ; encoding: [0xff,0x64,0x00,0x7e,0xca,0x21,0x09,0xc0]
+// NOGFX11: :[[@LINE-5]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOGFX12: :[[@LINE-6]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOGFX89: :[[@LINE-7]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOSICI: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// SICI: v_fract_f64_e32 v[0:1], 0xc00921ca ; encoding: [0xff,0x7c,0x00,0x7e,0xca,0x21,0x09,0xc0]
// NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
v_trunc_f32 v0, -3.1415
-// SICI: v_trunc_f32_e32 v0, 0xc0490e56 ; encoding: [0xff,0x42,0x00,0x7e,0x56,0x0e,0x49,0xc0]
-// GFX89: v_trunc_f32_e32 v0, 0xc0490e56 ; encoding: [0xff,0x38,0x00,0x7e,0x56,0x0e,0x49,0xc0]
-// GFX12XX: v_trunc_f32_e32 v0, 0xc0490e56 ; encoding: [0xff,0x42,0x00,0x7e,0x56,0x0e,0x49,0xc0]
// GFX11: v_trunc_f32_e32 v0, 0xc0490e56 ; encoding: [0xff,0x42,0x00,0x7e,0x56,0x0e,0x49,0xc0]
+// GFX12XX: v_trunc_f32_e32 v0, 0xc0490e56 ; encoding: [0xff,0x42,0x00,0x7e,0x56,0x0e,0x49,0xc0]
+// GFX89: v_trunc_f32_e32 v0, 0xc0490e56 ; encoding: [0xff,0x38,0x00,0x7e,0x56,0x0e,0x49,0xc0]
+// SICI: v_trunc_f32_e32 v0, 0xc0490e56 ; encoding: [0xff,0x42,0x00,0x7e,0x56,0x0e,0x49,0xc0]
v_fract_f64 v[0:1], 100000000000000000000000.0
-// SICI: v_fract_f64_e32 v[0:1], 0x44b52d02 ; encoding: [0xff,0x7c,0x00,0x7e,0x02,0x2d,0xb5,0x44]
-// GFX89: v_fract_f64_e32 v[0:1], 0x44b52d02 ; encoding: [0xff,0x64,0x00,0x7e,0x02,0x2d,0xb5,0x44]
-// NOSICI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
-// NOGFX89: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
// GFX11: v_fract_f64_e32 v[0:1], 0x44b52d02 ; encoding: [0xff,0x7c,0x00,0x7e,0x02,0x2d,0xb5,0x44]
// GFX12: v_fract_f64_e32 v[0:1], 0x44b52d02 ; encoding: [0xff,0x7c,0x00,0x7e,0x02,0x2d,0xb5,0x44]
// GFX1250: v_fract_f64_e32 v[0:1], 0x44b52d02c7e14af6 ; encoding: [0xfe,0x7c,0x00,0x7e,0xf6,0x4a,0xe1,0xc7,0x02,0x2d,0xb5,0x44]
-// NOGFX11: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
-// NOGFX12: :[[@LINE-9]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// GFX89: v_fract_f64_e32 v[0:1], 0x44b52d02 ; encoding: [0xff,0x64,0x00,0x7e,0x02,0x2d,0xb5,0x44]
+// NOGFX11: :[[@LINE-5]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOGFX12: :[[@LINE-6]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOGFX89: :[[@LINE-7]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOSICI: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// SICI: v_fract_f64_e32 v[0:1], 0x44b52d02 ; encoding: [0xff,0x7c,0x00,0x7e,0x02,0x2d,0xb5,0x44]
// NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
v_trunc_f32 v0, 100000000000000000000000.0
-// SICI: v_trunc_f32_e32 v0, 0x65a96816 ; encoding: [0xff,0x42,0x00,0x7e,0x16,0x68,0xa9,0x65]
-// GFX89: v_trunc_f32_e32 v0, 0x65a96816 ; encoding: [0xff,0x38,0x00,0x7e,0x16,0x68,0xa9,0x65]
-// GFX12XX: v_trunc_f32_e32 v0, 0x65a96816 ; encoding: [0xff,0x42,0x00,0x7e,0x16,0x68,0xa9,0x65]
// GFX11: v_trunc_f32_e32 v0, 0x65a96816 ; encoding: [0xff,0x42,0x00,0x7e,0x16,0x68,0xa9,0x65]
+// GFX12XX: v_trunc_f32_e32 v0, 0x65a96816 ; encoding: [0xff,0x42,0x00,0x7e,0x16,0x68,0xa9,0x65]
+// GFX89: v_trunc_f32_e32 v0, 0x65a96816 ; encoding: [0xff,0x38,0x00,0x7e,0x16,0x68,0xa9,0x65]
+// SICI: v_trunc_f32_e32 v0, 0x65a96816 ; encoding: [0xff,0x42,0x00,0x7e,0x16,0x68,0xa9,0x65]
v_fract_f64 v[0:1], 10000000.0
-// SICI: v_fract_f64_e32 v[0:1], 0x416312d0 ; encoding: [0xff,0x7c,0x00,0x7e,0xd0,0x12,0x63,0x41]
-// GFX89: v_fract_f64_e32 v[0:1], 0x416312d0 ; encoding: [0xff,0x64,0x00,0x7e,0xd0,0x12,0x63,0x41]
-// GFX12XX: v_fract_f64_e32 v[0:1], 0x416312d0 ; encoding: [0xff,0x7c,0x00,0x7e,0xd0,0x12,0x63,0x41]
// GFX11: v_fract_f64_e32 v[0:1], 0x416312d0 ; encoding: [0xff,0x7c,0x00,0x7e,0xd0,0x12,0x63,0x41]
+// GFX12XX: v_fract_f64_e32 v[0:1], 0x416312d0 ; encoding: [0xff,0x7c,0x00,0x7e,0xd0,0x12,0x63,0x41]
+// GFX89: v_fract_f64_e32 v[0:1], 0x416312d0 ; encoding: [0xff,0x64,0x00,0x7e,0xd0,0x12,0x63,0x41]
+// SICI: v_fract_f64_e32 v[0:1], 0x416312d0 ; encoding: [0xff,0x7c,0x00,0x7e,0xd0,0x12,0x63,0x41]
v_trunc_f32 v0, 10000000.0
-// SICI: v_trunc_f32_e32 v0, 0x4b189680 ; encoding: [0xff,0x42,0x00,0x7e,0x80,0x96,0x18,0x4b]
-// GFX89: v_trunc_f32_e32 v0, 0x4b189680 ; encoding: [0xff,0x38,0x00,0x7e,0x80,0x96,0x18,0x4b]
-// GFX12XX: v_trunc_f32_e32 v0, 0x4b189680 ; encoding: [0xff,0x42,0x00,0x7e,0x80,0x96,0x18,0x4b]
// GFX11: v_trunc_f32_e32 v0, 0x4b189680 ; encoding: [0xff,0x42,0x00,0x7e,0x80,0x96,0x18,0x4b]
+// GFX12XX: v_trunc_f32_e32 v0, 0x4b189680 ; encoding: [0xff,0x42,0x00,0x7e,0x80,0x96,0x18,0x4b]
+// GFX89: v_trunc_f32_e32 v0, 0x4b189680 ; encoding: [0xff,0x38,0x00,0x7e,0x80,0x96,0x18,0x4b]
+// SICI: v_trunc_f32_e32 v0, 0x4b189680 ; encoding: [0xff,0x42,0x00,0x7e,0x80,0x96,0x18,0x4b]
v_fract_f64 v[0:1], 3.402823e+38
-// SICI: v_fract_f64_e32 v[0:1], 0x47efffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xef,0x47]
-// GFX89: v_fract_f64_e32 v[0:1], 0x47efffff ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0xef,0x47]
-// NOSICI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
-// NOGFX89: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
// GFX11: v_fract_f64_e32 v[0:1], 0x47efffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xef,0x47]
// GFX12: v_fract_f64_e32 v[0:1], 0x47efffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xef,0x47]
// GFX1250: v_fract_f64_e32 v[0:1], 0x47efffff966ad924 ; encoding: [0xfe,0x7c,0x00,0x7e,0x24,0xd9,0x6a,0x96,0xff,0xff,0xef,0x47]
-// NOGFX11: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
-// NOGFX12: :[[@LINE-9]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// GFX89: v_fract_f64_e32 v[0:1], 0x47efffff ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0xef,0x47]
+// NOGFX11: :[[@LINE-5]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOGFX12: :[[@LINE-6]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOGFX89: :[[@LINE-7]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOSICI: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// SICI: v_fract_f64_e32 v[0:1], 0x47efffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xef,0x47]
// NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
v_trunc_f32 v0, 3.402823e+38
-// SICI: v_trunc_f32_e32 v0, 0x7f7ffffd ; encoding: [0xff,0x42,0x00,0x7e,0xfd,0xff,0x7f,0x7f]
-// GFX89: v_trunc_f32_e32 v0, 0x7f7ffffd ; encoding: [0xff,0x38,0x00,0x7e,0xfd,0xff,0x7f,0x7f]
-// GFX12XX: v_trunc_f32_e32 v0, 0x7f7ffffd ; encoding: [0xff,0x42,0x00,0x7e,0xfd,0xff,0x7f,0x7f]
// GFX11: v_trunc_f32_e32 v0, 0x7f7ffffd ; encoding: [0xff,0x42,0x00,0x7e,0xfd,0xff,0x7f,0x7f]
+// GFX12XX: v_trunc_f32_e32 v0, 0x7f7ffffd ; encoding: [0xff,0x42,0x00,0x7e,0xfd,0xff,0x7f,0x7f]
+// GFX89: v_trunc_f32_e32 v0, 0x7f7ffffd ; encoding: [0xff,0x38,0x00,0x7e,0xfd,0xff,0x7f,0x7f]
+// SICI: v_trunc_f32_e32 v0, 0x7f7ffffd ; encoding: [0xff,0x42,0x00,0x7e,0xfd,0xff,0x7f,0x7f]
v_fract_f64 v[0:1], 2.3509886e-38
-// SICI: v_fract_f64_e32 v[0:1], 0x381fffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0x1f,0x38]
-// GFX89: v_fract_f64_e32 v[0:1], 0x381fffff ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0x1f,0x38]
-// NOSICI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
-// NOGFX89: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
// GFX11: v_fract_f64_e32 v[0:1], 0x381fffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0x1f,0x38]
// GFX12: v_fract_f64_e32 v[0:1], 0x381fffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0x1f,0x38]
// GFX1250: v_fract_f64_e32 v[0:1], 0x381fffffe8c9d9fb ; encoding: [0xfe,0x7c,0x00,0x7e,0xfb,0xd9,0xc9,0xe8,0xff,0xff,0x1f,0x38]
-// NOGFX11: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
-// NOGFX12: :[[@LINE-9]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// GFX89: v_fract_f64_e32 v[0:1], 0x381fffff ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0x1f,0x38]
+// NOGFX11: :[[@LINE-5]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOGFX12: :[[@LINE-6]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOGFX89: :[[@LINE-7]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOSICI: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// SICI: v_fract_f64_e32 v[0:1], 0x381fffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0x1f,0x38]
// NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
v_trunc_f32 v0, 2.3509886e-38
-// SICI: v_trunc_f32_e32 v0, 0xffffff ; encoding: [0xff,0x42,0x00,0x7e,0xff,0xff,0xff,0x00]
-// GFX89: v_trunc_f32_e32 v0, 0xffffff ; encoding: [0xff,0x38,0x00,0x7e,0xff,0xff,0xff,0x00]
-// GFX12XX: v_trunc_f32_e32 v0, 0xffffff ; encoding: [0xff,0x42,0x00,0x7e,0xff,0xff,0xff,0x00]
// GFX11: v_trunc_f32_e32 v0, 0xffffff ; encoding: [0xff,0x42,0x00,0x7e,0xff,0xff,0xff,0x00]
+// GFX12XX: v_trunc_f32_e32 v0, 0xffffff ; encoding: [0xff,0x42,0x00,0x7e,0xff,0xff,0xff,0x00]
+// GFX89: v_trunc_f32_e32 v0, 0xffffff ; encoding: [0xff,0x38,0x00,0x7e,0xff,0xff,0xff,0x00]
+// SICI: v_trunc_f32_e32 v0, 0xffffff ; encoding: [0xff,0x42,0x00,0x7e,0xff,0xff,0xff,0x00]
v_fract_f64 v[0:1], 2.3509886e-70
-// SICI: v_fract_f64_e32 v[0:1], 0x3179f623 ; encoding: [0xff,0x7c,0x00,0x7e,0x23,0xf6,0x79,0x31]
-// GFX89: v_fract_f64_e32 v[0:1], 0x3179f623 ; encoding: [0xff,0x64,0x00,0x7e,0x23,0xf6,0x79,0x31]
-// NOSICI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
-// NOGFX89: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
// GFX11: v_fract_f64_e32 v[0:1], 0x3179f623 ; encoding: [0xff,0x7c,0x00,0x7e,0x23,0xf6,0x79,0x31]
// GFX12: v_fract_f64_e32 v[0:1], 0x3179f623 ; encoding: [0xff,0x7c,0x00,0x7e,0x23,0xf6,0x79,0x31]
// GFX1250: v_fract_f64_e32 v[0:1], 0x3179f623c2d3cf3c ; encoding: [0xfe,0x7c,0x00,0x7e,0x3c,0xcf,0xd3,0xc2,0x23,0xf6,0x79,0x31]
-// NOGFX11: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
-// NOGFX12: :[[@LINE-9]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// GFX89: v_fract_f64_e32 v[0:1], 0x3179f623 ; encoding: [0xff,0x64,0x00,0x7e,0x23,0xf6,0x79,0x31]
+// NOGFX11: :[[@LINE-5]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOGFX12: :[[@LINE-6]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOGFX89: :[[@LINE-7]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// NOSICI: :[[@LINE-8]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
+// SICI: v_fract_f64_e32 v[0:1], 0x3179f623 ; encoding: [0xff,0x7c,0x00,0x7e,0x23,0xf6,0x79,0x31]
// NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
v_trunc_f32 v0, 2.3509886e-70
// NOGCN: :[[@LINE-1]]:17: error: invalid operand for instruction
v_fract_f64_e32 v[0:1], 1.0
-// SICI: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x7c,0x00,0x7e]
-// GFX89: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x64,0x00,0x7e]
-// GFX12XX: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x7c,0x00,0x7e]
// GFX11: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x7c,0x00,0x7e]
+// GFX12XX: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x7c,0x00,0x7e]
+// GFX89: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x64,0x00,0x7e]
+// SICI: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x7c,0x00,0x7e]
v_fract_f64_e32 v[0:1], lit(1.0)
-// SICI: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f]
-// GFX89: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x64,0x00,0x7e,0x00,0x00,0xf0,0x3f]
// GFX11: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f]
// GFX12: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f]
// GFX1250: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xfe,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f,0x00,0x00,0x00,0x00]
+// GFX89: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x64,0x00,0x7e,0x00,0x00,0xf0,0x3f]
+// SICI: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f]
v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1.0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX11: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0xca,0x1b]
-// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
-// NOGFX1250: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
+// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
+// NOGFX1250: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], lit(1.0)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-// NOGFX11: :[[@LINE-3]]:54: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-4]]:54: error: invalid operand for instruction
-// NOGFX1250: :[[@LINE-5]]:54: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-1]]:54: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-2]]:54: error: invalid operand for instruction
+// NOGFX1250: :[[@LINE-3]]:54: error: invalid operand for instruction
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_cos_f16_e32 v5.l, 1.0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
// GFX11: v_cos_f16_e32 v5.l, 1.0 ; encoding: [0xf2,0xc2,0x0a,0x7e]
// GFX1250: v_cos_f16_e32 v5.l, 1.0 ; encoding: [0xf2,0xc2,0x0a,0x7e]
-// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
+// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX89: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_cos_f16_e32 v5.l, lit(1.0)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
// GFX11: v_cos_f16_e32 v5.l, lit(0x3c00) ; encoding: [0xff,0xc2,0x0a,0x7e,0x00,0x3c,0x00,0x00]
// GFX1250: v_cos_f16_e32 v5.l, lit(0x3c00) ; encoding: [0xff,0xc2,0x0a,0x7e,0x00,0x3c,0x00,0x00]
-// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
+// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX89: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_tanh_bf16 v5, 1.0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX1250: v_tanh_bf16_e32 v5, 1.0 ; encoding: [0xf2,0x94,0x0a,0x7e]
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_tanh_bf16 v5, lit(1.0)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX1250: v_tanh_bf16_e32 v5, lit(0x3f80) ; encoding: [0xff,0x94,0x0a,0x7e,0x80,0x3f,0x00,0x00]
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_trunc_f32_e32 v0, 1.0
-// SICI: v_trunc_f32_e32 v0, 1.0 ; encoding: [0xf2,0x42,0x00,0x7e]
-// GFX89: v_trunc_f32_e32 v0, 1.0 ; encoding: [0xf2,0x38,0x00,0x7e]
-// GFX12XX: v_trunc_f32_e32 v0, 1.0 ; encoding: [0xf2,0x42,0x00,0x7e]
// GFX11: v_trunc_f32_e32 v0, 1.0 ; encoding: [0xf2,0x42,0x00,0x7e]
+// GFX12XX: v_trunc_f32_e32 v0, 1.0 ; encoding: [0xf2,0x42,0x00,0x7e]
+// GFX89: v_trunc_f32_e32 v0, 1.0 ; encoding: [0xf2,0x38,0x00,0x7e]
+// SICI: v_trunc_f32_e32 v0, 1.0 ; encoding: [0xf2,0x42,0x00,0x7e]
v_trunc_f32_e32 v0, lit(1.0)
-// SICI: v_trunc_f32_e32 v0, lit(0x3f800000) ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0x80,0x3f]
-// GFX89: v_trunc_f32_e32 v0, lit(0x3f800000) ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x00,0x80,0x3f]
-// GFX12XX: v_trunc_f32_e32 v0, lit(0x3f800000) ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0x80,0x3f]
// GFX11: v_trunc_f32_e32 v0, lit(0x3f800000) ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0x80,0x3f]
+// GFX12XX: v_trunc_f32_e32 v0, lit(0x3f800000) ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0x80,0x3f]
+// GFX89: v_trunc_f32_e32 v0, lit(0x3f800000) ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x00,0x80,0x3f]
+// SICI: v_trunc_f32_e32 v0, lit(0x3f800000) ; encoding: [0xff,0x42,0x00,0x7e,0x00,0x00,0x80,0x3f]
v_dot2_bf16_bf16 v5.l, v1, v2, 1.0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX11: v_dot2_bf16_bf16 v5.l, v1, v2, 1.0 ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xca,0x03]
-// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_dot2_bf16_bf16 v5.l, v1, v2, lit(1.0)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX11: v_dot2_bf16_bf16 v5.l, v1, v2, lit(0x3f80) ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xfe,0x03,0x80,0x3f,0x00,0x00]
-// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_dot2_f32_f16 v5, v1, 1.0, v2
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX11: v_dot2_f32_f16 v5, v1, 1.0, v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xe5,0x09,0x1c]
// GFX12: v_dot2_f32_f16 v5, v1, 1.0, v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xe5,0x09,0x1c]
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_dot2_f32_f16 v5, v1, lit(1.0), v2
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX11: v_dot2_f32_f16 v5, v1, lit(0x3c00), v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xff,0x09,0x1c,0x00,0x3c,0x00,0x00]
// GFX12: v_dot2_f32_f16 v5, v1, lit(0x3c00), v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xff,0x09,0x1c,0x00,0x3c,0x00,0x00]
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_cvt_pk_fp8_f16 v1.l, 1.0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX1250: v_cvt_pk_fp8_f16 v1.l, 0x3c00 ; encoding: [0x01,0x00,0x72,0xd7,0xff,0x00,0x00,0x00,0x00,0x3c,0x00,0x00]
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_cvt_pk_fp8_f16 v1.l, lit(1.0)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX1250: v_cvt_pk_fp8_f16 v1.l, lit(0x3c00) ; encoding: [0x01,0x00,0x72,0xd7,0xff,0x00,0x00,0x00,0x00,0x3c,0x00,0x00]
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
//---------------------------------------------------------------------------//
// fp literal, expected int operand
@@ -309,118 +309,118 @@ s_mov_b64 s[0:1], lit(0.5)
// NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction
v_and_b32_e32 v0, 0.5, v1
-// SICI: v_and_b32_e32 v0, 0.5, v1 ; encoding: [0xf0,0x02,0x00,0x36]
-// GFX89: v_and_b32_e32 v0, 0.5, v1 ; encoding: [0xf0,0x02,0x00,0x26]
-// GFX12XX: v_and_b32_e32 v0, 0.5, v1 ; encoding: [0xf0,0x02,0x00,0x36]
// GFX11: v_and_b32_e32 v0, 0.5, v1 ; encoding: [0xf0,0x02,0x00,0x36]
+// GFX12XX: v_and_b32_e32 v0, 0.5, v1 ; encoding: [0xf0,0x02,0x00,0x36]
+// GFX89: v_and_b32_e32 v0, 0.5, v1 ; encoding: [0xf0,0x02,0x00,0x26]
+// SICI: v_and_b32_e32 v0, 0.5, v1 ; encoding: [0xf0,0x02,0x00,0x36]
v_and_b32_e64 v0, 0.5, v1
-// SICI: v_and_b32_e64 v0, 0.5, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xf0,0x02,0x02,0x00]
-// GFX89: v_and_b32_e64 v0, 0.5, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf0,0x02,0x02,0x00]
-// GFX12XX: v_and_b32_e64 v0, 0.5, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf0,0x02,0x02,0x00]
// GFX11: v_and_b32_e64 v0, 0.5, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf0,0x02,0x02,0x00]
+// GFX12XX: v_and_b32_e64 v0, 0.5, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf0,0x02,0x02,0x00]
+// GFX89: v_and_b32_e64 v0, 0.5, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf0,0x02,0x02,0x00]
+// SICI: v_and_b32_e64 v0, 0.5, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xf0,0x02,0x02,0x00]
s_mov_b64_e32 s[0:1], -1.0
// GFX8PLUS: s_mov_b64 s[0:1], -1.0 ; encoding: [0xf3,0x01,0x80,0xbe]
// SICI: s_mov_b64 s[0:1], -1.0 ; encoding: [0xf3,0x04,0x80,0xbe]
v_and_b32_e32 v0, -1.0, v1
-// SICI: v_and_b32_e32 v0, -1.0, v1 ; encoding: [0xf3,0x02,0x00,0x36]
-// GFX89: v_and_b32_e32 v0, -1.0, v1 ; encoding: [0xf3,0x02,0x00,0x26]
-// GFX12XX: v_and_b32_e32 v0, -1.0, v1 ; encoding: [0xf3,0x02,0x00,0x36]
// GFX11: v_and_b32_e32 v0, -1.0, v1 ; encoding: [0xf3,0x02,0x00,0x36]
+// GFX12XX: v_and_b32_e32 v0, -1.0, v1 ; encoding: [0xf3,0x02,0x00,0x36]
+// GFX89: v_and_b32_e32 v0, -1.0, v1 ; encoding: [0xf3,0x02,0x00,0x26]
+// SICI: v_and_b32_e32 v0, -1.0, v1 ; encoding: [0xf3,0x02,0x00,0x36]
v_and_b32_e64 v0, -1.0, v1
-// SICI: v_and_b32_e64 v0, -1.0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xf3,0x02,0x02,0x00]
-// GFX89: v_and_b32_e64 v0, -1.0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf3,0x02,0x02,0x00]
-// GFX12XX: v_and_b32_e64 v0, -1.0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf3,0x02,0x02,0x00]
// GFX11: v_and_b32_e64 v0, -1.0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf3,0x02,0x02,0x00]
+// GFX12XX: v_and_b32_e64 v0, -1.0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf3,0x02,0x02,0x00]
+// GFX89: v_and_b32_e64 v0, -1.0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf3,0x02,0x02,0x00]
+// SICI: v_and_b32_e64 v0, -1.0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xf3,0x02,0x02,0x00]
s_mov_b64_e32 s[0:1], 4.0
// GFX8PLUS: s_mov_b64 s[0:1], 4.0 ; encoding: [0xf6,0x01,0x80,0xbe]
// SICI: s_mov_b64 s[0:1], 4.0 ; encoding: [0xf6,0x04,0x80,0xbe]
v_and_b32_e32 v0, 4.0, v1
-// SICI: v_and_b32_e32 v0, 4.0, v1 ; encoding: [0xf6,0x02,0x00,0x36]
-// GFX89: v_and_b32_e32 v0, 4.0, v1 ; encoding: [0xf6,0x02,0x00,0x26]
-// GFX12XX: v_and_b32_e32 v0, 4.0, v1 ; encoding: [0xf6,0x02,0x00,0x36]
// GFX11: v_and_b32_e32 v0, 4.0, v1 ; encoding: [0xf6,0x02,0x00,0x36]
+// GFX12XX: v_and_b32_e32 v0, 4.0, v1 ; encoding: [0xf6,0x02,0x00,0x36]
+// GFX89: v_and_b32_e32 v0, 4.0, v1 ; encoding: [0xf6,0x02,0x00,0x26]
+// SICI: v_and_b32_e32 v0, 4.0, v1 ; encoding: [0xf6,0x02,0x00,0x36]
v_and_b32_e64 v0, 4.0, v1
-// SICI: v_and_b32_e64 v0, 4.0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xf6,0x02,0x02,0x00]
-// GFX89: v_and_b32_e64 v0, 4.0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf6,0x02,0x02,0x00]
-// GFX12XX: v_and_b32_e64 v0, 4.0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf6,0x02,0x02,0x00]
// GFX11: v_and_b32_e64 v0, 4.0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf6,0x02,0x02,0x00]
+// GFX12XX: v_and_b32_e64 v0, 4.0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf6,0x02,0x02,0x00]
+// GFX89: v_and_b32_e64 v0, 4.0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf6,0x02,0x02,0x00]
+// SICI: v_and_b32_e64 v0, 4.0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xf6,0x02,0x02,0x00]
s_mov_b64_e32 s[0:1], 0.0
// GFX8PLUS: s_mov_b64 s[0:1], 0 ; encoding: [0x80,0x01,0x80,0xbe]
// SICI: s_mov_b64 s[0:1], 0 ; encoding: [0x80,0x04,0x80,0xbe]
v_and_b32_e32 v0, 0.0, v1
-// SICI: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36]
-// GFX89: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x26]
-// GFX12XX: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36]
// GFX11: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36]
+// GFX12XX: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36]
+// GFX89: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x26]
+// SICI: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36]
v_and_b32_e64 v0, 0.0, v1
-// SICI: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0x80,0x02,0x02,0x00]
-// GFX89: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0x80,0x02,0x02,0x00]
-// GFX12XX: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00]
// GFX11: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00]
+// GFX12XX: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00]
+// GFX89: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0x80,0x02,0x02,0x00]
+// SICI: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0x80,0x02,0x02,0x00]
s_mov_b64_e32 s[0:1], 1.5
// NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction
v_and_b32_e32 v0, 1.5, v1
-// SICI: v_and_b32_e32 v0, 0x3fc00000, v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0xc0,0x3f]
-// GFX89: v_and_b32_e32 v0, 0x3fc00000, v1 ; encoding: [0xff,0x02,0x00,0x26,0x00,0x00,0xc0,0x3f]
-// GFX12XX: v_and_b32_e32 v0, 0x3fc00000, v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0xc0,0x3f]
// GFX11: v_and_b32_e32 v0, 0x3fc00000, v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0xc0,0x3f]
+// GFX12XX: v_and_b32_e32 v0, 0x3fc00000, v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0xc0,0x3f]
+// GFX89: v_and_b32_e32 v0, 0x3fc00000, v1 ; encoding: [0xff,0x02,0x00,0x26,0x00,0x00,0xc0,0x3f]
+// SICI: v_and_b32_e32 v0, 0x3fc00000, v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0xc0,0x3f]
s_mov_b64_e32 s[0:1], -3.1415
// NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction
v_and_b32_e32 v0, -3.1415, v1
-// SICI: v_and_b32_e32 v0, 0xc0490e56, v1 ; encoding: [0xff,0x02,0x00,0x36,0x56,0x0e,0x49,0xc0]
-// GFX89: v_and_b32_e32 v0, 0xc0490e56, v1 ; encoding: [0xff,0x02,0x00,0x26,0x56,0x0e,0x49,0xc0]
-// GFX12XX: v_and_b32_e32 v0, 0xc0490e56, v1 ; encoding: [0xff,0x02,0x00,0x36,0x56,0x0e,0x49,0xc0]
// GFX11: v_and_b32_e32 v0, 0xc0490e56, v1 ; encoding: [0xff,0x02,0x00,0x36,0x56,0x0e,0x49,0xc0]
+// GFX12XX: v_and_b32_e32 v0, 0xc0490e56, v1 ; encoding: [0xff,0x02,0x00,0x36,0x56,0x0e,0x49,0xc0]
+// GFX89: v_and_b32_e32 v0, 0xc0490e56, v1 ; encoding: [0xff,0x02,0x00,0x26,0x56,0x0e,0x49,0xc0]
+// SICI: v_and_b32_e32 v0, 0xc0490e56, v1 ; encoding: [0xff,0x02,0x00,0x36,0x56,0x0e,0x49,0xc0]
s_mov_b64_e32 s[0:1], 100000000000000000000000.0
// NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction
v_and_b32_e32 v0, 100000000000000000000000.0, v1
-// SICI: v_and_b32_e32 v0, 0x65a96816, v1 ; encoding: [0xff,0x02,0x00,0x36,0x16,0x68,0xa9,0x65]
-// GFX89: v_and_b32_e32 v0, 0x65a96816, v1 ; encoding: [0xff,0x02,0x00,0x26,0x16,0x68,0xa9,0x65]
-// GFX12XX: v_and_b32_e32 v0, 0x65a96816, v1 ; encoding: [0xff,0x02,0x00,0x36,0x16,0x68,0xa9,0x65]
// GFX11: v_and_b32_e32 v0, 0x65a96816, v1 ; encoding: [0xff,0x02,0x00,0x36,0x16,0x68,0xa9,0x65]
+// GFX12XX: v_and_b32_e32 v0, 0x65a96816, v1 ; encoding: [0xff,0x02,0x00,0x36,0x16,0x68,0xa9,0x65]
+// GFX89: v_and_b32_e32 v0, 0x65a96816, v1 ; encoding: [0xff,0x02,0x00,0x26,0x16,0x68,0xa9,0x65]
+// SICI: v_and_b32_e32 v0, 0x65a96816, v1 ; encoding: [0xff,0x02,0x00,0x36,0x16,0x68,0xa9,0x65]
s_mov_b64_e32 s[0:1], 10000000.0
// NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction
v_and_b32_e32 v0, 10000000.0, v1
-// SICI: v_and_b32_e32 v0, 0x4b189680, v1 ; encoding: [0xff,0x02,0x00,0x36,0x80,0x96,0x18,0x4b]
-// GFX89: v_and_b32_e32 v0, 0x4b189680, v1 ; encoding: [0xff,0x02,0x00,0x26,0x80,0x96,0x18,0x4b]
-// GFX12XX: v_and_b32_e32 v0, 0x4b189680, v1 ; encoding: [0xff,0x02,0x00,0x36,0x80,0x96,0x18,0x4b]
// GFX11: v_and_b32_e32 v0, 0x4b189680, v1 ; encoding: [0xff,0x02,0x00,0x36,0x80,0x96,0x18,0x4b]
+// GFX12XX: v_and_b32_e32 v0, 0x4b189680, v1 ; encoding: [0xff,0x02,0x00,0x36,0x80,0x96,0x18,0x4b]
+// GFX89: v_and_b32_e32 v0, 0x4b189680, v1 ; encoding: [0xff,0x02,0x00,0x26,0x80,0x96,0x18,0x4b]
+// SICI: v_and_b32_e32 v0, 0x4b189680, v1 ; encoding: [0xff,0x02,0x00,0x36,0x80,0x96,0x18,0x4b]
s_mov_b64_e32 s[0:1], 3.402823e+38
// NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction
v_and_b32_e32 v0, 3.402823e+38, v1
-// SICI: v_and_b32_e32 v0, 0x7f7ffffd, v1 ; encoding: [0xff,0x02,0x00,0x36,0xfd,0xff,0x7f,0x7f]
-// GFX89: v_and_b32_e32 v0, 0x7f7ffffd, v1 ; encoding: [0xff,0x02,0x00,0x26,0xfd,0xff,0x7f,0x7f]
-// GFX12XX: v_and_b32_e32 v0, 0x7f7ffffd, v1 ; encoding: [0xff,0x02,0x00,0x36,0xfd,0xff,0x7f,0x7f]
// GFX11: v_and_b32_e32 v0, 0x7f7ffffd, v1 ; encoding: [0xff,0x02,0x00,0x36,0xfd,0xff,0x7f,0x7f]
+// GFX12XX: v_and_b32_e32 v0, 0x7f7ffffd, v1 ; encoding: [0xff,0x02,0x00,0x36,0xfd,0xff,0x7f,0x7f]
+// GFX89: v_and_b32_e32 v0, 0x7f7ffffd, v1 ; encoding: [0xff,0x02,0x00,0x26,0xfd,0xff,0x7f,0x7f]
+// SICI: v_and_b32_e32 v0, 0x7f7ffffd, v1 ; encoding: [0xff,0x02,0x00,0x36,0xfd,0xff,0x7f,0x7f]
s_mov_b64_e32 s[0:1], 2.3509886e-38
// NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction
v_and_b32_e32 v0, 2.3509886e-38, v1
-// SICI: v_and_b32_e32 v0, 0xffffff, v1 ; encoding: [0xff,0x02,0x00,0x36,0xff,0xff,0xff,0x00]
-// GFX89: v_and_b32_e32 v0, 0xffffff, v1 ; encoding: [0xff,0x02,0x00,0x26,0xff,0xff,0xff,0x00]
-// GFX12XX: v_and_b32_e32 v0, 0xffffff, v1 ; encoding: [0xff,0x02,0x00,0x36,0xff,0xff,0xff,0x00]
// GFX11: v_and_b32_e32 v0, 0xffffff, v1 ; encoding: [0xff,0x02,0x00,0x36,0xff,0xff,0xff,0x00]
+// GFX12XX: v_and_b32_e32 v0, 0xffffff, v1 ; encoding: [0xff,0x02,0x00,0x36,0xff,0xff,0xff,0x00]
+// GFX89: v_and_b32_e32 v0, 0xffffff, v1 ; encoding: [0xff,0x02,0x00,0x26,0xff,0xff,0xff,0x00]
+// SICI: v_and_b32_e32 v0, 0xffffff, v1 ; encoding: [0xff,0x02,0x00,0x36,0xff,0xff,0xff,0x00]
s_mov_b64_e32 s[0:1], 2.3509886e-70
// NOGCN: :[[@LINE-1]]:23: error: invalid operand for instruction
@@ -429,322 +429,322 @@ v_and_b32_e32 v0, 2.3509886e-70, v1
// NOGCN: :[[@LINE-1]]:19: error: invalid operand for instruction
v_not_b16 v5.l, 1.0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX11: v_not_b16_e32 v5.l, 1.0 ; encoding: [0xf2,0xd2,0x0a,0x7e]
// GFX1250: v_not_b16_e32 v5.l, 1.0 ; encoding: [0xf2,0xd2,0x0a,0x7e]
-// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
+// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_not_b16 v5.l, lit(1.0)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX11: v_not_b16_e32 v5.l, lit(0x3f800000) ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x00,0x80,0x3f]
// GFX1250: v_not_b16_e32 v5.l, lit(0x3f800000) ; encoding: [0xff,0xd2,0x0a,0x7e,0x00,0x00,0x80,0x3f]
-// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
+// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_and_b32_e32 v0, 1.0, v1
-// SICI: v_and_b32_e32 v0, 1.0, v1 ; encoding: [0xf2,0x02,0x00,0x36]
-// GFX89: v_and_b32_e32 v0, 1.0, v1 ; encoding: [0xf2,0x02,0x00,0x26]
-// GFX12XX: v_and_b32_e32 v0, 1.0, v1 ; encoding: [0xf2,0x02,0x00,0x36]
// GFX11: v_and_b32_e32 v0, 1.0, v1 ; encoding: [0xf2,0x02,0x00,0x36]
+// GFX12XX: v_and_b32_e32 v0, 1.0, v1 ; encoding: [0xf2,0x02,0x00,0x36]
+// GFX89: v_and_b32_e32 v0, 1.0, v1 ; encoding: [0xf2,0x02,0x00,0x26]
+// SICI: v_and_b32_e32 v0, 1.0, v1 ; encoding: [0xf2,0x02,0x00,0x36]
v_and_b32_e32 v0, lit(1.0), v1
-// SICI: v_and_b32_e32 v0, lit(0x3f800000), v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0x80,0x3f]
-// GFX89: v_and_b32_e32 v0, lit(0x3f800000), v1 ; encoding: [0xff,0x02,0x00,0x26,0x00,0x00,0x80,0x3f]
-// GFX12XX: v_and_b32_e32 v0, lit(0x3f800000), v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0x80,0x3f]
// GFX11: v_and_b32_e32 v0, lit(0x3f800000), v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0x80,0x3f]
+// GFX12XX: v_and_b32_e32 v0, lit(0x3f800000), v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0x80,0x3f]
+// GFX89: v_and_b32_e32 v0, lit(0x3f800000), v1 ; encoding: [0xff,0x02,0x00,0x26,0x00,0x00,0x80,0x3f]
+// SICI: v_and_b32_e32 v0, lit(0x3f800000), v1 ; encoding: [0xff,0x02,0x00,0x36,0x00,0x00,0x80,0x3f]
v_pk_add_u16 v5, exec_lo, 1.0
+// GFX11: v_pk_add_u16 v5, exec_lo, 1.0 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xe4,0x01,0x18]
// GFX12XX: v_pk_add_u16 v5, exec_lo, 1.0 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xe4,0x01,0x18]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX9: v_pk_add_u16 v5, exec_lo, 1.0 ; encoding: [0x05,0x40,0x8a,0xd3,0x7e,0xe4,0x01,0x18]
-// GFX11: v_pk_add_u16 v5, exec_lo, 1.0 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xe4,0x01,0x18]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
// NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_pk_add_u16 v5, exec_lo, lit(1.0)
-// GFX12XX: v_pk_add_u16 v5, exec_lo, lit(0x3f800000) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x00,0x00,0x80,0x3f]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX11: v_pk_add_u16 v5, exec_lo, lit(0x3f800000) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x00,0x00,0x80,0x3f]
-// NOVI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX9: :[[@LINE-5]]:31: error: invalid operand (violates constant bus restrictions)
+// GFX12XX: v_pk_add_u16 v5, exec_lo, lit(0x3f800000) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x00,0x00,0x80,0x3f]
+// NOGFX9: :[[@LINE-3]]:31: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 1.0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 1.0 ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x09,0xca,0x03]
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], lit(1.0)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], lit(0x3f800000) ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x09,0xfe,0x03,0x00,0x00,0x80,0x3f]
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
//---------------------------------------------------------------------------//
// int literal, expected fp operand
//---------------------------------------------------------------------------//
v_trunc_f32_e32 v0, 0
-// SICI: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e]
-// GFX89: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x38,0x00,0x7e]
-// GFX12XX: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e]
// GFX11: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e]
+// GFX12XX: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e]
+// GFX89: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x38,0x00,0x7e]
+// SICI: v_trunc_f32_e32 v0, 0 ; encoding: [0x80,0x42,0x00,0x7e]
v_fract_f64_e32 v[0:1], 1
-// SICI: v_fract_f64_e32 v[0:1], 1 ; encoding: [0x81,0x7c,0x00,0x7e]
-// GFX89: v_fract_f64_e32 v[0:1], 1 ; encoding: [0x81,0x64,0x00,0x7e]
-// GFX12XX: v_fract_f64_e32 v[0:1], 1 ; encoding: [0x81,0x7c,0x00,0x7e]
// GFX11: v_fract_f64_e32 v[0:1], 1 ; encoding: [0x81,0x7c,0x00,0x7e]
+// GFX12XX: v_fract_f64_e32 v[0:1], 1 ; encoding: [0x81,0x7c,0x00,0x7e]
+// GFX89: v_fract_f64_e32 v[0:1], 1 ; encoding: [0x81,0x64,0x00,0x7e]
+// SICI: v_fract_f64_e32 v[0:1], 1 ; encoding: [0x81,0x7c,0x00,0x7e]
v_fract_f64_e32 v[0:1], lit(1)
-// SICI: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00]
-// GFX89: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x64,0x00,0x7e,0x01,0x00,0x00,0x00]
// GFX11: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00]
// GFX12: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00]
// GFX1250: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xfe,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+// GFX89: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x64,0x00,0x7e,0x01,0x00,0x00,0x00]
+// SICI: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00]
v_trunc_f32_e64 v0, 0
-// SICI: v_trunc_f32_e64 v0, 0 ; encoding: [0x00,0x00,0x42,0xd3,0x80,0x00,0x00,0x00]
-// GFX89: v_trunc_f32_e64 v0, 0 ; encoding: [0x00,0x00,0x5c,0xd1,0x80,0x00,0x00,0x00]
-// GFX12XX: v_trunc_f32_e64 v0, 0 ; encoding: [0x00,0x00,0xa1,0xd5,0x80,0x00,0x00,0x00]
// GFX11: v_trunc_f32_e64 v0, 0 ; encoding: [0x00,0x00,0xa1,0xd5,0x80,0x00,0x00,0x00]
+// GFX12XX: v_trunc_f32_e64 v0, 0 ; encoding: [0x00,0x00,0xa1,0xd5,0x80,0x00,0x00,0x00]
+// GFX89: v_trunc_f32_e64 v0, 0 ; encoding: [0x00,0x00,0x5c,0xd1,0x80,0x00,0x00,0x00]
+// SICI: v_trunc_f32_e64 v0, 0 ; encoding: [0x00,0x00,0x42,0xd3,0x80,0x00,0x00,0x00]
v_fract_f64_e64 v[0:1], 0
-// SICI: v_fract_f64_e64 v[0:1], 0 ; encoding: [0x00,0x00,0x7c,0xd3,0x80,0x00,0x00,0x00]
-// GFX89: v_fract_f64_e64 v[0:1], 0 ; encoding: [0x00,0x00,0x72,0xd1,0x80,0x00,0x00,0x00]
-// GFX12XX: v_fract_f64_e64 v[0:1], 0 ; encoding: [0x00,0x00,0xbe,0xd5,0x80,0x00,0x00,0x00]
// GFX11: v_fract_f64_e64 v[0:1], 0 ; encoding: [0x00,0x00,0xbe,0xd5,0x80,0x00,0x00,0x00]
+// GFX12XX: v_fract_f64_e64 v[0:1], 0 ; encoding: [0x00,0x00,0xbe,0xd5,0x80,0x00,0x00,0x00]
+// GFX89: v_fract_f64_e64 v[0:1], 0 ; encoding: [0x00,0x00,0x72,0xd1,0x80,0x00,0x00,0x00]
+// SICI: v_fract_f64_e64 v[0:1], 0 ; encoding: [0x00,0x00,0x7c,0xd3,0x80,0x00,0x00,0x00]
v_trunc_f32_e32 v0, -13
-// SICI: v_trunc_f32_e32 v0, -13 ; encoding: [0xcd,0x42,0x00,0x7e]
-// GFX89: v_trunc_f32_e32 v0, -13 ; encoding: [0xcd,0x38,0x00,0x7e]
-// GFX12XX: v_trunc_f32_e32 v0, -13 ; encoding: [0xcd,0x42,0x00,0x7e]
// GFX11: v_trunc_f32_e32 v0, -13 ; encoding: [0xcd,0x42,0x00,0x7e]
+// GFX12XX: v_trunc_f32_e32 v0, -13 ; encoding: [0xcd,0x42,0x00,0x7e]
+// GFX89: v_trunc_f32_e32 v0, -13 ; encoding: [0xcd,0x38,0x00,0x7e]
+// SICI: v_trunc_f32_e32 v0, -13 ; encoding: [0xcd,0x42,0x00,0x7e]
v_fract_f64_e32 v[0:1], -13
-// SICI: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x7c,0x00,0x7e]
-// GFX89: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x64,0x00,0x7e]
-// GFX12XX: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x7c,0x00,0x7e]
// GFX11: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x7c,0x00,0x7e]
+// GFX12XX: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x7c,0x00,0x7e]
+// GFX89: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x64,0x00,0x7e]
+// SICI: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x7c,0x00,0x7e]
v_trunc_f32_e64 v0, -13
-// SICI: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0x42,0xd3,0xcd,0x00,0x00,0x00]
-// GFX89: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0x5c,0xd1,0xcd,0x00,0x00,0x00]
-// GFX12XX: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0xa1,0xd5,0xcd,0x00,0x00,0x00]
// GFX11: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0xa1,0xd5,0xcd,0x00,0x00,0x00]
+// GFX12XX: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0xa1,0xd5,0xcd,0x00,0x00,0x00]
+// GFX89: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0x5c,0xd1,0xcd,0x00,0x00,0x00]
+// SICI: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0x42,0xd3,0xcd,0x00,0x00,0x00]
v_fract_f64_e64 v[0:1], -13
-// SICI: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0x7c,0xd3,0xcd,0x00,0x00,0x00]
-// GFX89: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0x72,0xd1,0xcd,0x00,0x00,0x00]
-// GFX12XX: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0xbe,0xd5,0xcd,0x00,0x00,0x00]
// GFX11: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0xbe,0xd5,0xcd,0x00,0x00,0x00]
+// GFX12XX: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0xbe,0xd5,0xcd,0x00,0x00,0x00]
+// GFX89: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0x72,0xd1,0xcd,0x00,0x00,0x00]
+// SICI: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0x7c,0xd3,0xcd,0x00,0x00,0x00]
v_trunc_f32_e32 v0, 35
-// SICI: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x42,0x00,0x7e]
-// GFX89: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x38,0x00,0x7e]
-// GFX12XX: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x42,0x00,0x7e]
// GFX11: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x42,0x00,0x7e]
+// GFX12XX: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x42,0x00,0x7e]
+// GFX89: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x38,0x00,0x7e]
+// SICI: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x42,0x00,0x7e]
v_fract_f64_e32 v[0:1], 35
-// SICI: v_fract_f64_e32 v[0:1], 35 ; encoding: [0xa3,0x7c,0x00,0x7e]
-// GFX89: v_fract_f64_e32 v[0:1], 35 ; encoding: [0xa3,0x64,0x00,0x7e]
-// GFX12XX: v_fract_f64_e32 v[0:1], 35 ; encoding: [0xa3,0x7c,0x00,0x7e]
// GFX11: v_fract_f64_e32 v[0:1], 35 ; encoding: [0xa3,0x7c,0x00,0x7e]
+// GFX12XX: v_fract_f64_e32 v[0:1], 35 ; encoding: [0xa3,0x7c,0x00,0x7e]
+// GFX89: v_fract_f64_e32 v[0:1], 35 ; encoding: [0xa3,0x64,0x00,0x7e]
+// SICI: v_fract_f64_e32 v[0:1], 35 ; encoding: [0xa3,0x7c,0x00,0x7e]
v_trunc_f32_e64 v0, 35
-// SICI: v_trunc_f32_e64 v0, 35 ; encoding: [0x00,0x00,0x42,0xd3,0xa3,0x00,0x00,0x00]
-// GFX89: v_trunc_f32_e64 v0, 35 ; encoding: [0x00,0x00,0x5c,0xd1,0xa3,0x00,0x00,0x00]
-// GFX12XX: v_trunc_f32_e64 v0, 35 ; encoding: [0x00,0x00,0xa1,0xd5,0xa3,0x00,0x00,0x00]
// GFX11: v_trunc_f32_e64 v0, 35 ; encoding: [0x00,0x00,0xa1,0xd5,0xa3,0x00,0x00,0x00]
+// GFX12XX: v_trunc_f32_e64 v0, 35 ; encoding: [0x00,0x00,0xa1,0xd5,0xa3,0x00,0x00,0x00]
+// GFX89: v_trunc_f32_e64 v0, 35 ; encoding: [0x00,0x00,0x5c,0xd1,0xa3,0x00,0x00,0x00]
+// SICI: v_trunc_f32_e64 v0, 35 ; encoding: [0x00,0x00,0x42,0xd3,0xa3,0x00,0x00,0x00]
v_fract_f64_e64 v[0:1], 35
-// SICI: v_fract_f64_e64 v[0:1], 35 ; encoding: [0x00,0x00,0x7c,0xd3,0xa3,0x00,0x00,0x00]
-// GFX89: v_fract_f64_e64 v[0:1], 35 ; encoding: [0x00,0x00,0x72,0xd1,0xa3,0x00,0x00,0x00]
-// GFX12XX: v_fract_f64_e64 v[0:1], 35 ; encoding: [0x00,0x00,0xbe,0xd5,0xa3,0x00,0x00,0x00]
// GFX11: v_fract_f64_e64 v[0:1], 35 ; encoding: [0x00,0x00,0xbe,0xd5,0xa3,0x00,0x00,0x00]
+// GFX12XX: v_fract_f64_e64 v[0:1], 35 ; encoding: [0x00,0x00,0xbe,0xd5,0xa3,0x00,0x00,0x00]
+// GFX89: v_fract_f64_e64 v[0:1], 35 ; encoding: [0x00,0x00,0x72,0xd1,0xa3,0x00,0x00,0x00]
+// SICI: v_fract_f64_e64 v[0:1], 35 ; encoding: [0x00,0x00,0x7c,0xd3,0xa3,0x00,0x00,0x00]
v_trunc_f32_e32 v0, 1234
-// SICI: v_trunc_f32_e32 v0, 0x4d2 ; encoding: [0xff,0x42,0x00,0x7e,0xd2,0x04,0x00,0x00]
-// GFX89: v_trunc_f32_e32 v0, 0x4d2 ; encoding: [0xff,0x38,0x00,0x7e,0xd2,0x04,0x00,0x00]
-// GFX12XX: v_trunc_f32_e32 v0, 0x4d2 ; encoding: [0xff,0x42,0x00,0x7e,0xd2,0x04,0x00,0x00]
// GFX11: v_trunc_f32_e32 v0, 0x4d2 ; encoding: [0xff,0x42,0x00,0x7e,0xd2,0x04,0x00,0x00]
+// GFX12XX: v_trunc_f32_e32 v0, 0x4d2 ; encoding: [0xff,0x42,0x00,0x7e,0xd2,0x04,0x00,0x00]
+// GFX89: v_trunc_f32_e32 v0, 0x4d2 ; encoding: [0xff,0x38,0x00,0x7e,0xd2,0x04,0x00,0x00]
+// SICI: v_trunc_f32_e32 v0, 0x4d2 ; encoding: [0xff,0x42,0x00,0x7e,0xd2,0x04,0x00,0x00]
v_fract_f64_e32 v[0:1], 1234
-// SICI: v_fract_f64_e32 v[0:1], 0x4d2 ; encoding: [0xff,0x7c,0x00,0x7e,0xd2,0x04,0x00,0x00]
-// GFX89: v_fract_f64_e32 v[0:1], 0x4d2 ; encoding: [0xff,0x64,0x00,0x7e,0xd2,0x04,0x00,0x00]
-// GFX12XX: v_fract_f64_e32 v[0:1], 0x4d2 ; encoding: [0xff,0x7c,0x00,0x7e,0xd2,0x04,0x00,0x00]
// GFX11: v_fract_f64_e32 v[0:1], 0x4d2 ; encoding: [0xff,0x7c,0x00,0x7e,0xd2,0x04,0x00,0x00]
+// GFX12XX: v_fract_f64_e32 v[0:1], 0x4d2 ; encoding: [0xff,0x7c,0x00,0x7e,0xd2,0x04,0x00,0x00]
+// GFX89: v_fract_f64_e32 v[0:1], 0x4d2 ; encoding: [0xff,0x64,0x00,0x7e,0xd2,0x04,0x00,0x00]
+// SICI: v_fract_f64_e32 v[0:1], 0x4d2 ; encoding: [0xff,0x7c,0x00,0x7e,0xd2,0x04,0x00,0x00]
v_trunc_f32_e64 v0, 1234
+// GFX11: v_trunc_f32_e64 v0, 0x4d2 ; encoding: [0x00,0x00,0xa1,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00]
// GFX12XX: v_trunc_f32_e64 v0, 0x4d2 ; encoding: [0x00,0x00,0xa1,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00]
-// NOSICI: :[[@LINE-2]]:21: error: literal operands are not supported
// NOGFX89: :[[@LINE-3]]:21: error: literal operands are not supported
-// GFX11: v_trunc_f32_e64 v0, 0x4d2 ; encoding: [0x00,0x00,0xa1,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00]
+// NOSICI: :[[@LINE-4]]:21: error: literal operands are not supported
// NOSICIVI: :[[@LINE-1]]:21: error: literal operands are not supported
v_fract_f64_e64 v[0:1], 1234
+// GFX11: v_fract_f64_e64 v[0:1], 0x4d2 ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00]
// GFX12XX: v_fract_f64_e64 v[0:1], 0x4d2 ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00]
-// NOSICI: :[[@LINE-2]]:25: error: literal operands are not supported
// NOGFX89: :[[@LINE-3]]:25: error: literal operands are not supported
-// GFX11: v_fract_f64_e64 v[0:1], 0x4d2 ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0xd2,0x04,0x00,0x00]
+// NOSICI: :[[@LINE-4]]:25: error: literal operands are not supported
// NOSICIVI: :[[@LINE-1]]:25: error: literal operands are not supported
v_trunc_f32_e32 v0, -54321
-// SICI: v_trunc_f32_e32 v0, 0xffff2bcf ; encoding: [0xff,0x42,0x00,0x7e,0xcf,0x2b,0xff,0xff]
-// GFX89: v_trunc_f32_e32 v0, 0xffff2bcf ; encoding: [0xff,0x38,0x00,0x7e,0xcf,0x2b,0xff,0xff]
-// GFX12XX: v_trunc_f32_e32 v0, 0xffff2bcf ; encoding: [0xff,0x42,0x00,0x7e,0xcf,0x2b,0xff,0xff]
// GFX11: v_trunc_f32_e32 v0, 0xffff2bcf ; encoding: [0xff,0x42,0x00,0x7e,0xcf,0x2b,0xff,0xff]
+// GFX12XX: v_trunc_f32_e32 v0, 0xffff2bcf ; encoding: [0xff,0x42,0x00,0x7e,0xcf,0x2b,0xff,0xff]
+// GFX89: v_trunc_f32_e32 v0, 0xffff2bcf ; encoding: [0xff,0x38,0x00,0x7e,0xcf,0x2b,0xff,0xff]
+// SICI: v_trunc_f32_e32 v0, 0xffff2bcf ; encoding: [0xff,0x42,0x00,0x7e,0xcf,0x2b,0xff,0xff]
v_fract_f64_e32 v[0:1], -54321
-// SICI: v_fract_f64_e32 v[0:1], 0xffff2bcf ; encoding: [0xff,0x7c,0x00,0x7e,0xcf,0x2b,0xff,0xff]
-// GFX89: v_fract_f64_e32 v[0:1], 0xffff2bcf ; encoding: [0xff,0x64,0x00,0x7e,0xcf,0x2b,0xff,0xff]
-// GFX12XX: v_fract_f64_e32 v[0:1], 0xffff2bcf ; encoding: [0xff,0x7c,0x00,0x7e,0xcf,0x2b,0xff,0xff]
// GFX11: v_fract_f64_e32 v[0:1], 0xffff2bcf ; encoding: [0xff,0x7c,0x00,0x7e,0xcf,0x2b,0xff,0xff]
+// GFX12XX: v_fract_f64_e32 v[0:1], 0xffff2bcf ; encoding: [0xff,0x7c,0x00,0x7e,0xcf,0x2b,0xff,0xff]
+// GFX89: v_fract_f64_e32 v[0:1], 0xffff2bcf ; encoding: [0xff,0x64,0x00,0x7e,0xcf,0x2b,0xff,0xff]
+// SICI: v_fract_f64_e32 v[0:1], 0xffff2bcf ; encoding: [0xff,0x7c,0x00,0x7e,0xcf,0x2b,0xff,0xff]
v_trunc_f32_e32 v0, 0xdeadbeef
-// SICI: v_trunc_f32_e32 v0, 0xdeadbeef ; encoding: [0xff,0x42,0x00,0x7e,0xef,0xbe,0xad,0xde]
-// GFX89: v_trunc_f32_e32 v0, 0xdeadbeef ; encoding: [0xff,0x38,0x00,0x7e,0xef,0xbe,0xad,0xde]
-// GFX12XX: v_trunc_f32_e32 v0, 0xdeadbeef ; encoding: [0xff,0x42,0x00,0x7e,0xef,0xbe,0xad,0xde]
// GFX11: v_trunc_f32_e32 v0, 0xdeadbeef ; encoding: [0xff,0x42,0x00,0x7e,0xef,0xbe,0xad,0xde]
+// GFX12XX: v_trunc_f32_e32 v0, 0xdeadbeef ; encoding: [0xff,0x42,0x00,0x7e,0xef,0xbe,0xad,0xde]
+// GFX89: v_trunc_f32_e32 v0, 0xdeadbeef ; encoding: [0xff,0x38,0x00,0x7e,0xef,0xbe,0xad,0xde]
+// SICI: v_trunc_f32_e32 v0, 0xdeadbeef ; encoding: [0xff,0x42,0x00,0x7e,0xef,0xbe,0xad,0xde]
v_fract_f64_e32 v[0:1], 0xdeadbeef
-// SICI: v_fract_f64_e32 v[0:1], 0xdeadbeef ; encoding: [0xff,0x7c,0x00,0x7e,0xef,0xbe,0xad,0xde]
-// GFX89: v_fract_f64_e32 v[0:1], 0xdeadbeef ; encoding: [0xff,0x64,0x00,0x7e,0xef,0xbe,0xad,0xde]
-// GFX12XX: v_fract_f64_e32 v[0:1], 0xdeadbeef ; encoding: [0xff,0x7c,0x00,0x7e,0xef,0xbe,0xad,0xde]
// GFX11: v_fract_f64_e32 v[0:1], 0xdeadbeef ; encoding: [0xff,0x7c,0x00,0x7e,0xef,0xbe,0xad,0xde]
+// GFX12XX: v_fract_f64_e32 v[0:1], 0xdeadbeef ; encoding: [0xff,0x7c,0x00,0x7e,0xef,0xbe,0xad,0xde]
+// GFX89: v_fract_f64_e32 v[0:1], 0xdeadbeef ; encoding: [0xff,0x64,0x00,0x7e,0xef,0xbe,0xad,0xde]
+// SICI: v_fract_f64_e32 v[0:1], 0xdeadbeef ; encoding: [0xff,0x7c,0x00,0x7e,0xef,0xbe,0xad,0xde]
v_trunc_f32_e32 v0, 0xffffffff
-// SICI: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e]
-// GFX89: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x38,0x00,0x7e]
-// GFX12XX: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e]
// GFX11: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e]
+// GFX12XX: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e]
+// GFX89: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x38,0x00,0x7e]
+// SICI: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e]
v_fract_f64_e32 v[0:1], 0xffffffff
-// SICI: v_fract_f64_e32 v[0:1], 0xffffffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xff,0xff]
-// GFX89: v_fract_f64_e32 v[0:1], 0xffffffff ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0xff,0xff]
-// GFX12XX: v_fract_f64_e32 v[0:1], 0xffffffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xff,0xff]
// GFX11: v_fract_f64_e32 v[0:1], 0xffffffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xff,0xff]
+// GFX12XX: v_fract_f64_e32 v[0:1], 0xffffffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xff,0xff]
+// GFX89: v_fract_f64_e32 v[0:1], 0xffffffff ; encoding: [0xff,0x64,0x00,0x7e,0xff,0xff,0xff,0xff]
+// SICI: v_fract_f64_e32 v[0:1], 0xffffffff ; encoding: [0xff,0x7c,0x00,0x7e,0xff,0xff,0xff,0xff]
v_trunc_f32_e32 v0, 0x123456789abcdef0
// NOGCN: :[[@LINE-1]]:21: error: invalid operand for instruction
v_fract_f64_e32 v[0:1], 0x123456789abcdef0
-// NOSICI: :[[@LINE-1]]:25: error: invalid operand for instruction
-// NOGFX89: :[[@LINE-2]]:25: error: invalid operand for instruction
// GFX1250: v_fract_f64_e32 v[0:1], 0x123456789abcdef0 ; encoding: [0xfe,0x7c,0x00,0x7e,0xf0,0xde,0xbc,0x9a,0x78,0x56,0x34,0x12]
-// NOGFX11: :[[@LINE-4]]:25: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-5]]:25: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-2]]:25: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-3]]:25: error: invalid operand for instruction
+// NOGFX89: :[[@LINE-4]]:25: error: invalid operand for instruction
+// NOSICI: :[[@LINE-5]]:25: error: invalid operand for instruction
// NOSICIVI: :[[@LINE-1]]:25: error: invalid operand for instruction
v_trunc_f32_e32 v0, 0xffffffffffffffff
-// SICI: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e]
-// GFX89: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x38,0x00,0x7e]
-// GFX12XX: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e]
// GFX11: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e]
+// GFX12XX: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e]
+// GFX89: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x38,0x00,0x7e]
+// SICI: v_trunc_f32_e32 v0, -1 ; encoding: [0xc1,0x42,0x00,0x7e]
v_fract_f64_e32 v[0:1], 0xffffffffffffffff
-// SICI: v_fract_f64_e32 v[0:1], -1 ; encoding: [0xc1,0x7c,0x00,0x7e]
-// GFX89: v_fract_f64_e32 v[0:1], -1 ; encoding: [0xc1,0x64,0x00,0x7e]
-// GFX12XX: v_fract_f64_e32 v[0:1], -1 ; encoding: [0xc1,0x7c,0x00,0x7e]
// GFX11: v_fract_f64_e32 v[0:1], -1 ; encoding: [0xc1,0x7c,0x00,0x7e]
+// GFX12XX: v_fract_f64_e32 v[0:1], -1 ; encoding: [0xc1,0x7c,0x00,0x7e]
+// GFX89: v_fract_f64_e32 v[0:1], -1 ; encoding: [0xc1,0x64,0x00,0x7e]
+// SICI: v_fract_f64_e32 v[0:1], -1 ; encoding: [0xc1,0x7c,0x00,0x7e]
v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX11: v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x44,0xcc,0x00,0x09,0x06,0x1a]
-// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
-// NOGFX1250: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
+// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
+// NOGFX1250: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_wmma_i32_16x16x16_iu8 v[8:15], v[0:3], v[4:7], lit(1)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-// NOGFX11: :[[@LINE-3]]:54: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-4]]:54: error: invalid operand for instruction
-// NOGFX1250: :[[@LINE-5]]:54: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-1]]:54: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-2]]:54: error: invalid operand for instruction
+// NOGFX1250: :[[@LINE-3]]:54: error: invalid operand for instruction
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_cos_f16_e32 v5.l, 1
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
// GFX11: v_cos_f16_e32 v5.l, 1 ; encoding: [0x81,0xc2,0x0a,0x7e]
// GFX1250: v_cos_f16_e32 v5.l, 1 ; encoding: [0x81,0xc2,0x0a,0x7e]
-// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
+// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX89: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_cos_f16_e32 v5.l, lit(1)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
// GFX11: v_cos_f16_e32 v5.l, lit(0x1) ; encoding: [0xff,0xc2,0x0a,0x7e,0x01,0x00,0x00,0x00]
// GFX1250: v_cos_f16_e32 v5.l, lit(0x1) ; encoding: [0xff,0xc2,0x0a,0x7e,0x01,0x00,0x00,0x00]
-// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
+// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX89: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_tanh_bf16 v5, 1
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX1250: v_tanh_bf16_e32 v5, 1 ; encoding: [0x81,0x94,0x0a,0x7e]
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_tanh_bf16 v5, lit(1)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX1250: v_tanh_bf16_e32 v5, lit(0x1) ; encoding: [0xff,0x94,0x0a,0x7e,0x01,0x00,0x00,0x00]
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_trunc_f32_e32 v0, 1
-// SICI: v_trunc_f32_e32 v0, 1 ; encoding: [0x81,0x42,0x00,0x7e]
-// GFX89: v_trunc_f32_e32 v0, 1 ; encoding: [0x81,0x38,0x00,0x7e]
-// GFX12XX: v_trunc_f32_e32 v0, 1 ; encoding: [0x81,0x42,0x00,0x7e]
// GFX11: v_trunc_f32_e32 v0, 1 ; encoding: [0x81,0x42,0x00,0x7e]
+// GFX12XX: v_trunc_f32_e32 v0, 1 ; encoding: [0x81,0x42,0x00,0x7e]
+// GFX89: v_trunc_f32_e32 v0, 1 ; encoding: [0x81,0x38,0x00,0x7e]
+// SICI: v_trunc_f32_e32 v0, 1 ; encoding: [0x81,0x42,0x00,0x7e]
v_trunc_f32_e32 v0, lit(1)
-// SICI: v_trunc_f32_e32 v0, lit(0x1) ; encoding: [0xff,0x42,0x00,0x7e,0x01,0x00,0x00,0x00]
-// GFX89: v_trunc_f32_e32 v0, lit(0x1) ; encoding: [0xff,0x38,0x00,0x7e,0x01,0x00,0x00,0x00]
-// GFX12XX: v_trunc_f32_e32 v0, lit(0x1) ; encoding: [0xff,0x42,0x00,0x7e,0x01,0x00,0x00,0x00]
// GFX11: v_trunc_f32_e32 v0, lit(0x1) ; encoding: [0xff,0x42,0x00,0x7e,0x01,0x00,0x00,0x00]
+// GFX12XX: v_trunc_f32_e32 v0, lit(0x1) ; encoding: [0xff,0x42,0x00,0x7e,0x01,0x00,0x00,0x00]
+// GFX89: v_trunc_f32_e32 v0, lit(0x1) ; encoding: [0xff,0x38,0x00,0x7e,0x01,0x00,0x00,0x00]
+// SICI: v_trunc_f32_e32 v0, lit(0x1) ; encoding: [0xff,0x42,0x00,0x7e,0x01,0x00,0x00,0x00]
v_dot2_bf16_bf16 v5.l, v1, v2, 1
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX11: v_dot2_bf16_bf16 v5.l, v1, v2, 1 ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0x06,0x02]
-// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_dot2_bf16_bf16 v5.l, v1, v2, lit(1)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX11: v_dot2_bf16_bf16 v5.l, v1, v2, lit(0x1) ; encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0xfe,0x03,0x01,0x00,0x00,0x00]
-// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_dot2_f32_f16 v5, v1, 1, v2
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX11: v_dot2_f32_f16 v5, v1, 1, v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0x03,0x09,0x1c]
// GFX12: v_dot2_f32_f16 v5, v1, 1, v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0x03,0x09,0x1c]
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_dot2_f32_f16 v5, v1, lit(1), v2
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX11: v_dot2_f32_f16 v5, v1, lit(0x1), v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xff,0x09,0x1c,0x01,0x00,0x00,0x00]
// GFX12: v_dot2_f32_f16 v5, v1, lit(0x1), v2 ; encoding: [0x05,0x40,0x13,0xcc,0x01,0xff,0x09,0x1c,0x01,0x00,0x00,0x00]
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_cvt_pk_fp8_f16 v1.l, 1
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX1250: v_cvt_pk_fp8_f16 v1.l, 1 ; encoding: [0x01,0x00,0x72,0xd7,0xff,0x00,0x00,0x00,0x01,0x00,0x00,0x00]
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_cvt_pk_fp8_f16 v1.l, lit(1)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX1250: v_cvt_pk_fp8_f16 v1.l, lit(0x1) ; encoding: [0x01,0x00,0x72,0xd7,0xff,0x00,0x00,0x00,0x01,0x00,0x00,0x00]
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
//---------------------------------------------------------------------------//
// int literal, expected int operand
@@ -755,111 +755,111 @@ s_mov_b64_e32 s[0:1], 0
// SICI: s_mov_b64 s[0:1], 0 ; encoding: [0x80,0x04,0x80,0xbe]
v_and_b32_e32 v0, 0, v1
-// SICI: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36]
-// GFX89: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x26]
-// GFX12XX: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36]
// GFX11: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36]
+// GFX12XX: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36]
+// GFX89: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x26]
+// SICI: v_and_b32_e32 v0, 0, v1 ; encoding: [0x80,0x02,0x00,0x36]
v_and_b32_e64 v0, 0, v1
-// SICI: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0x80,0x02,0x02,0x00]
-// GFX89: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0x80,0x02,0x02,0x00]
-// GFX12XX: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00]
// GFX11: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00]
+// GFX12XX: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0x80,0x02,0x02,0x00]
+// GFX89: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x13,0xd1,0x80,0x02,0x02,0x00]
+// SICI: v_and_b32_e64 v0, 0, v1 ; encoding: [0x00,0x00,0x36,0xd2,0x80,0x02,0x02,0x00]
s_mov_b64_e32 s[0:1], -13
// GFX8PLUS: s_mov_b64 s[0:1], -13 ; encoding: [0xcd,0x01,0x80,0xbe]
// SICI: s_mov_b64 s[0:1], -13 ; encoding: [0xcd,0x04,0x80,0xbe]
v_and_b32_e32 v0, -13, v1
-// SICI: v_and_b32_e32 v0, -13, v1 ; encoding: [0xcd,0x02,0x00,0x36]
-// GFX89: v_and_b32_e32 v0, -13, v1 ; encoding: [0xcd,0x02,0x00,0x26]
-// GFX12XX: v_and_b32_e32 v0, -13, v1 ; encoding: [0xcd,0x02,0x00,0x36]
// GFX11: v_and_b32_e32 v0, -13, v1 ; encoding: [0xcd,0x02,0x00,0x36]
+// GFX12XX: v_and_b32_e32 v0, -13, v1 ; encoding: [0xcd,0x02,0x00,0x36]
+// GFX89: v_and_b32_e32 v0, -13, v1 ; encoding: [0xcd,0x02,0x00,0x26]
+// SICI: v_and_b32_e32 v0, -13, v1 ; encoding: [0xcd,0x02,0x00,0x36]
v_and_b32_e64 v0, -13, v1
-// SICI: v_and_b32_e64 v0, -13, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xcd,0x02,0x02,0x00]
-// GFX89: v_and_b32_e64 v0, -13, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xcd,0x02,0x02,0x00]
-// GFX12XX: v_and_b32_e64 v0, -13, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xcd,0x02,0x02,0x00]
// GFX11: v_and_b32_e64 v0, -13, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xcd,0x02,0x02,0x00]
+// GFX12XX: v_and_b32_e64 v0, -13, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xcd,0x02,0x02,0x00]
+// GFX89: v_and_b32_e64 v0, -13, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xcd,0x02,0x02,0x00]
+// SICI: v_and_b32_e64 v0, -13, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xcd,0x02,0x02,0x00]
s_mov_b64_e32 s[0:1], 35
// GFX8PLUS: s_mov_b64 s[0:1], 35 ; encoding: [0xa3,0x01,0x80,0xbe]
// SICI: s_mov_b64 s[0:1], 35 ; encoding: [0xa3,0x04,0x80,0xbe]
v_and_b32_e32 v0, 35, v1
-// SICI: v_and_b32_e32 v0, 35, v1 ; encoding: [0xa3,0x02,0x00,0x36]
-// GFX89: v_and_b32_e32 v0, 35, v1 ; encoding: [0xa3,0x02,0x00,0x26]
-// GFX12XX: v_and_b32_e32 v0, 35, v1 ; encoding: [0xa3,0x02,0x00,0x36]
// GFX11: v_and_b32_e32 v0, 35, v1 ; encoding: [0xa3,0x02,0x00,0x36]
+// GFX12XX: v_and_b32_e32 v0, 35, v1 ; encoding: [0xa3,0x02,0x00,0x36]
+// GFX89: v_and_b32_e32 v0, 35, v1 ; encoding: [0xa3,0x02,0x00,0x26]
+// SICI: v_and_b32_e32 v0, 35, v1 ; encoding: [0xa3,0x02,0x00,0x36]
v_and_b32_e64 v0, 35, v1
-// SICI: v_and_b32_e64 v0, 35, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xa3,0x02,0x02,0x00]
-// GFX89: v_and_b32_e64 v0, 35, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xa3,0x02,0x02,0x00]
-// GFX12XX: v_and_b32_e64 v0, 35, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xa3,0x02,0x02,0x00]
// GFX11: v_and_b32_e64 v0, 35, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xa3,0x02,0x02,0x00]
+// GFX12XX: v_and_b32_e64 v0, 35, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xa3,0x02,0x02,0x00]
+// GFX89: v_and_b32_e64 v0, 35, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xa3,0x02,0x02,0x00]
+// SICI: v_and_b32_e64 v0, 35, v1 ; encoding: [0x00,0x00,0x36,0xd2,0xa3,0x02,0x02,0x00]
s_mov_b64_e32 s[0:1], 1234
// GFX8PLUS: s_mov_b64 s[0:1], 0x4d2 ; encoding: [0xff,0x01,0x80,0xbe,0xd2,0x04,0x00,0x00]
// SICI: s_mov_b64 s[0:1], 0x4d2 ; encoding: [0xff,0x04,0x80,0xbe,0xd2,0x04,0x00,0x00]
v_and_b32_e32 v0, 1234, v1
-// SICI: v_and_b32_e32 v0, 0x4d2, v1 ; encoding: [0xff,0x02,0x00,0x36,0xd2,0x04,0x00,0x00]
-// GFX89: v_and_b32_e32 v0, 0x4d2, v1 ; encoding: [0xff,0x02,0x00,0x26,0xd2,0x04,0x00,0x00]
-// GFX12XX: v_and_b32_e32 v0, 0x4d2, v1 ; encoding: [0xff,0x02,0x00,0x36,0xd2,0x04,0x00,0x00]
// GFX11: v_and_b32_e32 v0, 0x4d2, v1 ; encoding: [0xff,0x02,0x00,0x36,0xd2,0x04,0x00,0x00]
+// GFX12XX: v_and_b32_e32 v0, 0x4d2, v1 ; encoding: [0xff,0x02,0x00,0x36,0xd2,0x04,0x00,0x00]
+// GFX89: v_and_b32_e32 v0, 0x4d2, v1 ; encoding: [0xff,0x02,0x00,0x26,0xd2,0x04,0x00,0x00]
+// SICI: v_and_b32_e32 v0, 0x4d2, v1 ; encoding: [0xff,0x02,0x00,0x36,0xd2,0x04,0x00,0x00]
v_and_b32_e64 v0, 1234, v1
+// GFX11: v_and_b32_e64 v0, 0x4d2, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xff,0x02,0x02,0x00,0xd2,0x04,0x00,0x00]
// GFX12XX: v_and_b32_e64 v0, 0x4d2, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xff,0x02,0x02,0x00,0xd2,0x04,0x00,0x00]
-// NOSICI: :[[@LINE-2]]:19: error: literal operands are not supported
// NOGFX89: :[[@LINE-3]]:19: error: literal operands are not supported
-// GFX11: v_and_b32_e64 v0, 0x4d2, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xff,0x02,0x02,0x00,0xd2,0x04,0x00,0x00]
+// NOSICI: :[[@LINE-4]]:19: error: literal operands are not supported
// NOSICIVI: :[[@LINE-1]]:19: error: literal operands are not supported
s_mov_b64_e32 s[0:1], -54321
-// SICI: s_mov_b64 s[0:1], 0xffff2bcf ; encoding: [0xff,0x04,0x80,0xbe,0xcf,0x2b,0xff,0xff]
-// GFX89: s_mov_b64 s[0:1], 0xffff2bcf ; encoding: [0xff,0x01,0x80,0xbe,0xcf,0x2b,0xff,0xff]
// GFX11: s_mov_b64 s[0:1], 0xffff2bcf ; encoding: [0xff,0x01,0x80,0xbe,0xcf,0x2b,0xff,0xff]
// GFX12: s_mov_b64 s[0:1], 0xffff2bcf ; encoding: [0xff,0x01,0x80,0xbe,0xcf,0x2b,0xff,0xff]
// GFX1250: s_mov_b64 s[0:1], 0xffffffffffff2bcf ; encoding: [0xfe,0x01,0x80,0xbe,0xcf,0x2b,0xff,0xff,0xff,0xff,0xff,0xff]
+// GFX89: s_mov_b64 s[0:1], 0xffff2bcf ; encoding: [0xff,0x01,0x80,0xbe,0xcf,0x2b,0xff,0xff]
+// SICI: s_mov_b64 s[0:1], 0xffff2bcf ; encoding: [0xff,0x04,0x80,0xbe,0xcf,0x2b,0xff,0xff]
v_and_b32_e32 v0, -54321, v1
-// SICI: v_and_b32_e32 v0, 0xffff2bcf, v1 ; encoding: [0xff,0x02,0x00,0x36,0xcf,0x2b,0xff,0xff]
-// GFX89: v_and_b32_e32 v0, 0xffff2bcf, v1 ; encoding: [0xff,0x02,0x00,0x26,0xcf,0x2b,0xff,0xff]
-// GFX12XX: v_and_b32_e32 v0, 0xffff2bcf, v1 ; encoding: [0xff,0x02,0x00,0x36,0xcf,0x2b,0xff,0xff]
// GFX11: v_and_b32_e32 v0, 0xffff2bcf, v1 ; encoding: [0xff,0x02,0x00,0x36,0xcf,0x2b,0xff,0xff]
+// GFX12XX: v_and_b32_e32 v0, 0xffff2bcf, v1 ; encoding: [0xff,0x02,0x00,0x36,0xcf,0x2b,0xff,0xff]
+// GFX89: v_and_b32_e32 v0, 0xffff2bcf, v1 ; encoding: [0xff,0x02,0x00,0x26,0xcf,0x2b,0xff,0xff]
+// SICI: v_and_b32_e32 v0, 0xffff2bcf, v1 ; encoding: [0xff,0x02,0x00,0x36,0xcf,0x2b,0xff,0xff]
s_mov_b64_e32 s[0:1], 0xdeadbeef
-// SICI: s_mov_b64 s[0:1], 0xdeadbeef ; encoding: [0xff,0x04,0x80,0xbe,0xef,0xbe,0xad,0xde]
-// GFX89: s_mov_b64 s[0:1], 0xdeadbeef ; encoding: [0xff,0x01,0x80,0xbe,0xef,0xbe,0xad,0xde]
// GFX11: s_mov_b64 s[0:1], 0xdeadbeef ; encoding: [0xff,0x01,0x80,0xbe,0xef,0xbe,0xad,0xde]
// GFX12: s_mov_b64 s[0:1], 0xdeadbeef ; encoding: [0xff,0x01,0x80,0xbe,0xef,0xbe,0xad,0xde]
// GFX1250: s_mov_b64 s[0:1], 0xdeadbeef ; encoding: [0xfe,0x01,0x80,0xbe,0xef,0xbe,0xad,0xde,0x00,0x00,0x00,0x00]
+// GFX89: s_mov_b64 s[0:1], 0xdeadbeef ; encoding: [0xff,0x01,0x80,0xbe,0xef,0xbe,0xad,0xde]
+// SICI: s_mov_b64 s[0:1], 0xdeadbeef ; encoding: [0xff,0x04,0x80,0xbe,0xef,0xbe,0xad,0xde]
v_and_b32_e32 v0, 0xdeadbeef, v1
-// SICI: v_and_b32_e32 v0, 0xdeadbeef, v1 ; encoding: [0xff,0x02,0x00,0x36,0xef,0xbe,0xad,0xde]
-// GFX89: v_and_b32_e32 v0, 0xdeadbeef, v1 ; encoding: [0xff,0x02,0x00,0x26,0xef,0xbe,0xad,0xde]
-// GFX12XX: v_and_b32_e32 v0, 0xdeadbeef, v1 ; encoding: [0xff,0x02,0x00,0x36,0xef,0xbe,0xad,0xde]
// GFX11: v_and_b32_e32 v0, 0xdeadbeef, v1 ; encoding: [0xff,0x02,0x00,0x36,0xef,0xbe,0xad,0xde]
+// GFX12XX: v_and_b32_e32 v0, 0xdeadbeef, v1 ; encoding: [0xff,0x02,0x00,0x36,0xef,0xbe,0xad,0xde]
+// GFX89: v_and_b32_e32 v0, 0xdeadbeef, v1 ; encoding: [0xff,0x02,0x00,0x26,0xef,0xbe,0xad,0xde]
+// SICI: v_and_b32_e32 v0, 0xdeadbeef, v1 ; encoding: [0xff,0x02,0x00,0x36,0xef,0xbe,0xad,0xde]
s_mov_b64_e32 s[0:1], 0xffffffff
-// SICI: s_mov_b64 s[0:1], 0xffffffff ; encoding: [0xff,0x04,0x80,0xbe,0xff,0xff,0xff,0xff]
-// GFX89: s_mov_b64 s[0:1], 0xffffffff ; encoding: [0xff,0x01,0x80,0xbe,0xff,0xff,0xff,0xff]
// GFX11: s_mov_b64 s[0:1], 0xffffffff ; encoding: [0xff,0x01,0x80,0xbe,0xff,0xff,0xff,0xff]
// GFX12: s_mov_b64 s[0:1], 0xffffffff ; encoding: [0xff,0x01,0x80,0xbe,0xff,0xff,0xff,0xff]
// GFX1250: s_mov_b64 s[0:1], 0xffffffff ; encoding: [0xfe,0x01,0x80,0xbe,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00]
+// GFX89: s_mov_b64 s[0:1], 0xffffffff ; encoding: [0xff,0x01,0x80,0xbe,0xff,0xff,0xff,0xff]
+// SICI: s_mov_b64 s[0:1], 0xffffffff ; encoding: [0xff,0x04,0x80,0xbe,0xff,0xff,0xff,0xff]
v_and_b32_e32 v0, 0xffffffff, v1
-// SICI: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36]
-// GFX89: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x26]
-// GFX12XX: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36]
// GFX11: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36]
+// GFX12XX: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36]
+// GFX89: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x26]
+// SICI: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36]
s_mov_b64_e32 s[0:1], 0x123456789abcdef0
-// NOSICI: :[[@LINE-1]]:23: error: invalid operand for instruction
-// NOGFX89: :[[@LINE-2]]:23: error: invalid operand for instruction
// GFX1250: s_mov_b64 s[0:1], 0x123456789abcdef0 ; encoding: [0xfe,0x01,0x80,0xbe,0xf0,0xde,0xbc,0x9a,0x78,0x56,0x34,0x12]
-// NOGFX11: :[[@LINE-4]]:23: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-5]]:23: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-2]]:23: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-3]]:23: error: invalid operand for instruction
+// NOGFX89: :[[@LINE-4]]:23: error: invalid operand for instruction
+// NOSICI: :[[@LINE-5]]:23: error: invalid operand for instruction
// NOSICIVI: :[[@LINE-1]]:23: error: invalid operand for instruction
v_and_b32_e32 v0, 0x123456789abcdef0, v1
@@ -870,75 +870,75 @@ s_mov_b64_e32 s[0:1], 0xffffffffffffffff
// SICI: s_mov_b64 s[0:1], -1 ; encoding: [0xc1,0x04,0x80,0xbe]
v_and_b32_e32 v0, 0xffffffffffffffff, v1
-// SICI: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36]
-// GFX89: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x26]
-// GFX12XX: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36]
// GFX11: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36]
+// GFX12XX: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36]
+// GFX89: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x26]
+// SICI: v_and_b32_e32 v0, -1, v1 ; encoding: [0xc1,0x02,0x00,0x36]
v_not_b16 v5.l, 1
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX11: v_not_b16_e32 v5.l, 1 ; encoding: [0x81,0xd2,0x0a,0x7e]
// GFX1250: v_not_b16_e32 v5.l, 1 ; encoding: [0x81,0xd2,0x0a,0x7e]
-// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
+// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_not_b16 v5.l, lit(1)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX11: v_not_b16_e32 v5.l, lit(0x1) ; encoding: [0xff,0xd2,0x0a,0x7e,0x01,0x00,0x00,0x00]
// GFX1250: v_not_b16_e32 v5.l, lit(0x1) ; encoding: [0xff,0xd2,0x0a,0x7e,0x01,0x00,0x00,0x00]
-// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
+// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
s_mov_b64 s[0:1], 1
// GFX8PLUS: s_mov_b64 s[0:1], 1 ; encoding: [0x81,0x01,0x80,0xbe]
// SICI: s_mov_b64 s[0:1], 1 ; encoding: [0x81,0x04,0x80,0xbe]
s_mov_b64 s[0:1], lit(1)
-// SICI: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x04,0x80,0xbe,0x01,0x00,0x00,0x00]
-// GFX89: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x01,0x80,0xbe,0x01,0x00,0x00,0x00]
// GFX11: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x01,0x80,0xbe,0x01,0x00,0x00,0x00]
// GFX12: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x01,0x80,0xbe,0x01,0x00,0x00,0x00]
// GFX1250: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xfe,0x01,0x80,0xbe,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+// GFX89: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x01,0x80,0xbe,0x01,0x00,0x00,0x00]
+// SICI: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x04,0x80,0xbe,0x01,0x00,0x00,0x00]
v_and_b32_e32 v0, 1, v1
-// SICI: v_and_b32_e32 v0, 1, v1 ; encoding: [0x81,0x02,0x00,0x36]
-// GFX89: v_and_b32_e32 v0, 1, v1 ; encoding: [0x81,0x02,0x00,0x26]
-// GFX12XX: v_and_b32_e32 v0, 1, v1 ; encoding: [0x81,0x02,0x00,0x36]
// GFX11: v_and_b32_e32 v0, 1, v1 ; encoding: [0x81,0x02,0x00,0x36]
+// GFX12XX: v_and_b32_e32 v0, 1, v1 ; encoding: [0x81,0x02,0x00,0x36]
+// GFX89: v_and_b32_e32 v0, 1, v1 ; encoding: [0x81,0x02,0x00,0x26]
+// SICI: v_and_b32_e32 v0, 1, v1 ; encoding: [0x81,0x02,0x00,0x36]
v_and_b32_e32 v0, lit(1), v1
-// SICI: v_and_b32_e32 v0, lit(0x1), v1 ; encoding: [0xff,0x02,0x00,0x36,0x01,0x00,0x00,0x00]
-// GFX89: v_and_b32_e32 v0, lit(0x1), v1 ; encoding: [0xff,0x02,0x00,0x26,0x01,0x00,0x00,0x00]
-// GFX12XX: v_and_b32_e32 v0, lit(0x1), v1 ; encoding: [0xff,0x02,0x00,0x36,0x01,0x00,0x00,0x00]
// GFX11: v_and_b32_e32 v0, lit(0x1), v1 ; encoding: [0xff,0x02,0x00,0x36,0x01,0x00,0x00,0x00]
+// GFX12XX: v_and_b32_e32 v0, lit(0x1), v1 ; encoding: [0xff,0x02,0x00,0x36,0x01,0x00,0x00,0x00]
+// GFX89: v_and_b32_e32 v0, lit(0x1), v1 ; encoding: [0xff,0x02,0x00,0x26,0x01,0x00,0x00,0x00]
+// SICI: v_and_b32_e32 v0, lit(0x1), v1 ; encoding: [0xff,0x02,0x00,0x36,0x01,0x00,0x00,0x00]
v_pk_add_u16 v5, exec_lo, 1
+// GFX11: v_pk_add_u16 v5, exec_lo, 1 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0x02,0x01,0x18]
// GFX12XX: v_pk_add_u16 v5, exec_lo, 1 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0x02,0x01,0x18]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX9: v_pk_add_u16 v5, exec_lo, 1 ; encoding: [0x05,0x40,0x8a,0xd3,0x7e,0x02,0x01,0x18]
-// GFX11: v_pk_add_u16 v5, exec_lo, 1 ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0x02,0x01,0x18]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
// NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_pk_add_u16 v5, exec_lo, lit(1)
-// GFX12XX: v_pk_add_u16 v5, exec_lo, lit(0x1) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x01,0x00,0x00,0x00]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX11: v_pk_add_u16 v5, exec_lo, lit(0x1) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x01,0x00,0x00,0x00]
-// NOVI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX9: :[[@LINE-5]]:31: error: invalid operand (violates constant bus restrictions)
+// GFX12XX: v_pk_add_u16 v5, exec_lo, lit(0x1) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x18,0x01,0x00,0x00,0x00]
+// NOGFX9: :[[@LINE-3]]:31: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 1
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], 1 ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x09,0x06,0x02]
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], lit(1)
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX1250: v_perm_pk16_b6_u4 v[2:4], v4, v[4:5], lit(0x1) ; encoding: [0x02,0x00,0x42,0xd6,0x04,0x09,0xfe,0x03,0x01,0x00,0x00,0x00]
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
//---------------------------------------------------------------------------//
// 1/(2*PI)
@@ -948,46 +948,46 @@ v_trunc_f32_e32 v0, 0x3fc45f306dc9c882
// NOGCN: :[[@LINE-1]]:21: error: invalid operand for instruction
v_fract_f64_e32 v[0:1], 0x3fc45f306dc9c882
-// GFX89: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x64,0x00,0x7e]
-// GFX12XX: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e]
-// NOSICI: :[[@LINE-3]]:25: error: invalid operand for instruction
// GFX11: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e]
+// GFX12XX: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e]
+// GFX89: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x64,0x00,0x7e]
+// NOSICI: :[[@LINE-4]]:25: error: invalid operand for instruction
// NOSICIVI: :[[@LINE-2]]:25: error: invalid operand for instruction
v_trunc_f32_e32 v0, 0x3e22f983
-// SICI: v_trunc_f32_e32 v0, 0x3e22f983 ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e]
-// GFX89: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x38,0x00,0x7e]
-// GFX12XX: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x42,0x00,0x7e]
// GFX11: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x42,0x00,0x7e]
+// GFX12XX: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x42,0x00,0x7e]
+// GFX89: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x38,0x00,0x7e]
+// SICI: v_trunc_f32_e32 v0, 0x3e22f983 ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e]
v_fract_f64_e32 v[0:1], 0x3e22f983
-// SICI: v_fract_f64_e32 v[0:1], 0x3e22f983 ; encoding: [0xff,0x7c,0x00,0x7e,0x83,0xf9,0x22,0x3e]
-// GFX89: v_fract_f64_e32 v[0:1], 0x3e22f983 ; encoding: [0xff,0x64,0x00,0x7e,0x83,0xf9,0x22,0x3e]
-// GFX12XX: v_fract_f64_e32 v[0:1], 0x3e22f983 ; encoding: [0xff,0x7c,0x00,0x7e,0x83,0xf9,0x22,0x3e]
// GFX11: v_fract_f64_e32 v[0:1], 0x3e22f983 ; encoding: [0xff,0x7c,0x00,0x7e,0x83,0xf9,0x22,0x3e]
+// GFX12XX: v_fract_f64_e32 v[0:1], 0x3e22f983 ; encoding: [0xff,0x7c,0x00,0x7e,0x83,0xf9,0x22,0x3e]
+// GFX89: v_fract_f64_e32 v[0:1], 0x3e22f983 ; encoding: [0xff,0x64,0x00,0x7e,0x83,0xf9,0x22,0x3e]
+// SICI: v_fract_f64_e32 v[0:1], 0x3e22f983 ; encoding: [0xff,0x7c,0x00,0x7e,0x83,0xf9,0x22,0x3e]
v_trunc_f32_e64 v0, 0x3fc45f306dc9c882
// NOGCN: :[[@LINE-1]]:21: error: invalid operand for instruction
v_fract_f64_e64 v[0:1], 0x3fc45f306dc9c882
-// GFX89: v_fract_f64_e64 v[0:1], 0.15915494309189532 ; encoding: [0x00,0x00,0x72,0xd1,0xf8,0x00,0x00,0x00]
-// GFX12XX: v_fract_f64_e64 v[0:1], 0.15915494309189532 ; encoding: [0x00,0x00,0xbe,0xd5,0xf8,0x00,0x00,0x00]
-// NOSICI: :[[@LINE-3]]:25: error: invalid operand for instruction
// GFX11: v_fract_f64_e64 v[0:1], 0.15915494309189532 ; encoding: [0x00,0x00,0xbe,0xd5,0xf8,0x00,0x00,0x00]
+// GFX12XX: v_fract_f64_e64 v[0:1], 0.15915494309189532 ; encoding: [0x00,0x00,0xbe,0xd5,0xf8,0x00,0x00,0x00]
+// GFX89: v_fract_f64_e64 v[0:1], 0.15915494309189532 ; encoding: [0x00,0x00,0x72,0xd1,0xf8,0x00,0x00,0x00]
+// NOSICI: :[[@LINE-4]]:25: error: invalid operand for instruction
// NOSICIVI: :[[@LINE-2]]:25: error: invalid operand for instruction
v_trunc_f32_e64 v0, 0x3e22f983
-// GFX89: v_trunc_f32_e64 v0, 0.15915494 ; encoding: [0x00,0x00,0x5c,0xd1,0xf8,0x00,0x00,0x00]
-// GFX12XX: v_trunc_f32_e64 v0, 0.15915494 ; encoding: [0x00,0x00,0xa1,0xd5,0xf8,0x00,0x00,0x00]
-// NOSICI: :[[@LINE-3]]:21: error: literal operands are not supported
// GFX11: v_trunc_f32_e64 v0, 0.15915494 ; encoding: [0x00,0x00,0xa1,0xd5,0xf8,0x00,0x00,0x00]
+// GFX12XX: v_trunc_f32_e64 v0, 0.15915494 ; encoding: [0x00,0x00,0xa1,0xd5,0xf8,0x00,0x00,0x00]
+// GFX89: v_trunc_f32_e64 v0, 0.15915494 ; encoding: [0x00,0x00,0x5c,0xd1,0xf8,0x00,0x00,0x00]
+// NOSICI: :[[@LINE-4]]:21: error: literal operands are not supported
// NOSICIVI: :[[@LINE-2]]:21: error: literal operands are not supported
v_fract_f64_e64 v[0:1], 0x3e22f983
+// GFX11: v_fract_f64_e64 v[0:1], 0x3e22f983 ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0x83,0xf9,0x22,0x3e]
// GFX12XX: v_fract_f64_e64 v[0:1], 0x3e22f983 ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0x83,0xf9,0x22,0x3e]
-// NOSICI: :[[@LINE-2]]:25: error: literal operands are not supported
// NOGFX89: :[[@LINE-3]]:25: error: literal operands are not supported
-// GFX11: v_fract_f64_e64 v[0:1], 0x3e22f983 ; encoding: [0x00,0x00,0xbe,0xd5,0xff,0x00,0x00,0x00,0x83,0xf9,0x22,0x3e]
+// NOSICI: :[[@LINE-4]]:25: error: literal operands are not supported
// NOSICIVI: :[[@LINE-1]]:25: error: literal operands are not supported
s_mov_b64_e32 s[0:1], 0.159154943091895317852646485335
@@ -996,37 +996,37 @@ s_mov_b64_e32 s[0:1], 0.159154943091895317852646485335
// NOSICIVI: :[[@LINE-2]]:23: error: invalid operand for instruction
v_and_b32_e32 v0, 0.159154943091895317852646485335, v1
-// SICI: v_and_b32_e32 v0, 0x3e22f983, v1 ; encoding: [0xff,0x02,0x00,0x36,0x83,0xf9,0x22,0x3e]
-// GFX89: v_and_b32_e32 v0, 0.15915494, v1 ; encoding: [0xf8,0x02,0x00,0x26]
-// GFX12XX: v_and_b32_e32 v0, 0.15915494, v1 ; encoding: [0xf8,0x02,0x00,0x36]
// GFX11: v_and_b32_e32 v0, 0.15915494, v1 ; encoding: [0xf8,0x02,0x00,0x36]
+// GFX12XX: v_and_b32_e32 v0, 0.15915494, v1 ; encoding: [0xf8,0x02,0x00,0x36]
+// GFX89: v_and_b32_e32 v0, 0.15915494, v1 ; encoding: [0xf8,0x02,0x00,0x26]
+// SICI: v_and_b32_e32 v0, 0x3e22f983, v1 ; encoding: [0xff,0x02,0x00,0x36,0x83,0xf9,0x22,0x3e]
v_and_b32_e64 v0, 0.159154943091895317852646485335, v1
-// GFX89: v_and_b32_e64 v0, 0.15915494, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf8,0x02,0x02,0x00]
-// GFX12XX: v_and_b32_e64 v0, 0.15915494, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf8,0x02,0x02,0x00]
-// NOSICI: :[[@LINE-3]]:19: error: literal operands are not supported
// GFX11: v_and_b32_e64 v0, 0.15915494, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf8,0x02,0x02,0x00]
+// GFX12XX: v_and_b32_e64 v0, 0.15915494, v1 ; encoding: [0x00,0x00,0x1b,0xd5,0xf8,0x02,0x02,0x00]
+// GFX89: v_and_b32_e64 v0, 0.15915494, v1 ; encoding: [0x00,0x00,0x13,0xd1,0xf8,0x02,0x02,0x00]
+// NOSICI: :[[@LINE-4]]:19: error: literal operands are not supported
// NOSICIVI: :[[@LINE-2]]:19: error: literal operands are not supported
v_fract_f64 v[0:1], 0.159154943091895317852646485335
-// SICI: v_fract_f64_e32 v[0:1], 0x3fc45f30 ; encoding: [0xff,0x7c,0x00,0x7e,0x30,0x5f,0xc4,0x3f]
-// GFX89: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x64,0x00,0x7e]
+// GFX11: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e]
// GFX12XX: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e]
+// GFX89: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x64,0x00,0x7e]
// NOSICI: :[[@LINE-4]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
-// GFX11: v_fract_f64_e32 v[0:1], 0.15915494309189532 ; encoding: [0xf8,0x7c,0x00,0x7e]
+// SICI: v_fract_f64_e32 v[0:1], 0x3fc45f30 ; encoding: [0xff,0x7c,0x00,0x7e,0x30,0x5f,0xc4,0x3f]
// NOSICIVI: :[[@LINE-3]]:1: warning: Can't encode literal as exact 64-bit floating-point operand. Low 32-bits will be set to zero
v_trunc_f32 v0, 0.159154943091895317852646485335
-// SICI: v_trunc_f32_e32 v0, 0x3e22f983 ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e]
-// GFX89: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x38,0x00,0x7e]
-// GFX12XX: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x42,0x00,0x7e]
// GFX11: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x42,0x00,0x7e]
+// GFX12XX: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x42,0x00,0x7e]
+// GFX89: v_trunc_f32_e32 v0, 0.15915494 ; encoding: [0xf8,0x38,0x00,0x7e]
+// SICI: v_trunc_f32_e32 v0, 0x3e22f983 ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e]
v_trunc_f32 v0, lit(0.159154943091895317852646485335)
-// SICI: v_trunc_f32_e32 v0, lit(0x3e22f983) ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e]
-// GFX89: v_trunc_f32_e32 v0, lit(0x3e22f983) ; encoding: [0xff,0x38,0x00,0x7e,0x83,0xf9,0x22,0x3e]
-// GFX12XX: v_trunc_f32_e32 v0, lit(0x3e22f983) ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e]
// GFX11: v_trunc_f32_e32 v0, lit(0x3e22f983) ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e]
+// GFX12XX: v_trunc_f32_e32 v0, lit(0x3e22f983) ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e]
+// GFX89: v_trunc_f32_e32 v0, lit(0x3e22f983) ; encoding: [0xff,0x38,0x00,0x7e,0x83,0xf9,0x22,0x3e]
+// SICI: v_trunc_f32_e32 v0, lit(0x3e22f983) ; encoding: [0xff,0x42,0x00,0x7e,0x83,0xf9,0x22,0x3e]
//---------------------------------------------------------------------------//
// integer literal truncation checks
@@ -1051,54 +1051,54 @@ v_trunc_f32 v0, 0x1fffffff000
// NOGCN: :[[@LINE-1]]:17: error: invalid operand for instruction
s_mov_b64 s[0:1], 0x101ffffffff
-// NOSICI: :[[@LINE-1]]:19: error: invalid operand for instruction
-// NOGFX89: :[[@LINE-2]]:19: error: invalid operand for instruction
// GFX1250: s_mov_b64 s[0:1], 0x101ffffffff ; encoding: [0xfe,0x01,0x80,0xbe,0xff,0xff,0xff,0xff,0x01,0x01,0x00,0x00]
-// NOGFX11: :[[@LINE-4]]:19: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-5]]:19: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-2]]:19: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-3]]:19: error: invalid operand for instruction
+// NOGFX89: :[[@LINE-4]]:19: error: invalid operand for instruction
+// NOSICI: :[[@LINE-5]]:19: error: invalid operand for instruction
// NOSICIVI: :[[@LINE-1]]:19: error: invalid operand for instruction
s_mov_b64 s[0:1], 0x1000000001
-// NOSICI: :[[@LINE-1]]:19: error: invalid operand for instruction
-// NOGFX89: :[[@LINE-2]]:19: error: invalid operand for instruction
// GFX1250: s_mov_b64 s[0:1], 0x1000000001 ; encoding: [0xfe,0x01,0x80,0xbe,0x01,0x00,0x00,0x00,0x10,0x00,0x00,0x00]
-// NOGFX11: :[[@LINE-4]]:19: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-5]]:19: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-2]]:19: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-3]]:19: error: invalid operand for instruction
+// NOGFX89: :[[@LINE-4]]:19: error: invalid operand for instruction
+// NOSICI: :[[@LINE-5]]:19: error: invalid operand for instruction
// NOSICIVI: :[[@LINE-1]]:19: error: invalid operand for instruction
s_mov_b64 s[0:1], 0x1000000fff
-// NOSICI: :[[@LINE-1]]:19: error: invalid operand for instruction
-// NOGFX89: :[[@LINE-2]]:19: error: invalid operand for instruction
// GFX1250: s_mov_b64 s[0:1], 0x1000000fff ; encoding: [0xfe,0x01,0x80,0xbe,0xff,0x0f,0x00,0x00,0x10,0x00,0x00,0x00]
-// NOGFX11: :[[@LINE-4]]:19: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-5]]:19: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-2]]:19: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-3]]:19: error: invalid operand for instruction
+// NOGFX89: :[[@LINE-4]]:19: error: invalid operand for instruction
+// NOSICI: :[[@LINE-5]]:19: error: invalid operand for instruction
// NOSICIVI: :[[@LINE-1]]:19: error: invalid operand for instruction
v_trunc_f64 v[0:1], 0x1fffffffff0
-// NOGFX89: :[[@LINE-1]]:21: error: invalid operand for instruction
// GFX1250: v_trunc_f64_e32 v[0:1], 0x1fffffffff0 ; encoding: [0xfe,0x2e,0x00,0x7e,0xf0,0xff,0xff,0xff,0xff,0x01,0x00,0x00]
-// NOSI: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOCI: :[[@LINE-4]]:21: error: invalid operand for instruction
-// NOGFX11: :[[@LINE-5]]:21: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-6]]:21: error: invalid operand for instruction
+// NOCI: :[[@LINE-2]]:21: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-3]]:21: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-4]]:21: error: invalid operand for instruction
+// NOGFX89: :[[@LINE-5]]:21: error: invalid operand for instruction
+// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU
// NOCIVI: :[[@LINE-4]]:21: error: invalid operand for instruction
v_trunc_f64 v[0:1], 0x100000001
-// NOGFX89: :[[@LINE-1]]:21: error: invalid operand for instruction
// GFX1250: v_trunc_f64_e32 v[0:1], 0x100000001 ; encoding: [0xfe,0x2e,0x00,0x7e,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00]
-// NOSI: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOCI: :[[@LINE-4]]:21: error: invalid operand for instruction
-// NOGFX11: :[[@LINE-5]]:21: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-6]]:21: error: invalid operand for instruction
+// NOCI: :[[@LINE-2]]:21: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-3]]:21: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-4]]:21: error: invalid operand for instruction
+// NOGFX89: :[[@LINE-5]]:21: error: invalid operand for instruction
+// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU
// NOCIVI: :[[@LINE-4]]:21: error: invalid operand for instruction
v_trunc_f64 v[0:1], 0x1fffffff000
-// NOGFX89: :[[@LINE-1]]:21: error: invalid operand for instruction
// GFX1250: v_trunc_f64_e32 v[0:1], 0x1fffffff000 ; encoding: [0xfe,0x2e,0x00,0x7e,0x00,0xf0,0xff,0xff,0xff,0x01,0x00,0x00]
-// NOSI: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOCI: :[[@LINE-4]]:21: error: invalid operand for instruction
-// NOGFX11: :[[@LINE-5]]:21: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-6]]:21: error: invalid operand for instruction
+// NOCI: :[[@LINE-2]]:21: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-3]]:21: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-4]]:21: error: invalid operand for instruction
+// NOGFX89: :[[@LINE-5]]:21: error: invalid operand for instruction
+// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU
// NOCIVI: :[[@LINE-4]]:21: error: invalid operand for instruction
//---------------------------------------------------------------------------//
@@ -1106,210 +1106,210 @@ v_trunc_f64 v[0:1], 0x1fffffff000
//---------------------------------------------------------------------------//
buffer_atomic_add v0, off, s[0:3], scc offset:4095
-// SICI: buffer_atomic_add v0, off, s[0:3], src_scc offset:4095 ; encoding: [0xff,0x0f,0xc8,0xe0,0x00,0x00,0x00,0xfd]
-// GFX89: buffer_atomic_add v0, off, s[0:3], src_scc offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x00,0xfd]
-// GFX12XX: buffer_atomic_add_u32 v0, off, s[0:3], src_scc offset:4095 ; encoding: [0x7d,0x40,0x0d,0xc4,0x00,0x00,0x80,0x00,0x00,0xff,0x0f,0x00]
// GFX11: buffer_atomic_add_u32 v0, off, s[0:3], src_scc offset:4095 ; encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x00,0x00,0xfd]
+// GFX12XX: buffer_atomic_add_u32 v0, off, s[0:3], src_scc offset:4095 ; encoding: [0x7d,0x40,0x0d,0xc4,0x00,0x00,0x80,0x00,0x00,0xff,0x0f,0x00]
+// GFX89: buffer_atomic_add v0, off, s[0:3], src_scc offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x00,0xfd]
+// SICI: buffer_atomic_add v0, off, s[0:3], src_scc offset:4095 ; encoding: [0xff,0x0f,0xc8,0xe0,0x00,0x00,0x00,0xfd]
s_add_i32 s0, vccz, s0
-// SICI: s_add_i32 s0, src_vccz, s0 ; encoding: [0xfb,0x00,0x00,0x81]
// GFX89: s_add_i32 s0, src_vccz, s0 ; encoding: [0xfb,0x00,0x00,0x81]
-// NOGFX11: :[[@LINE-3]]:15: error: src_vccz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:15: error: src_vccz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:15: error: src_vccz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:15: error: src_vccz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:15: error: src_vccz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:15: error: src_vccz register not available on this GPU
+// SICI: s_add_i32 s0, src_vccz, s0 ; encoding: [0xfb,0x00,0x00,0x81]
s_add_i32 s0, execz, s0
-// SICI: s_add_i32 s0, src_execz, s0 ; encoding: [0xfc,0x00,0x00,0x81]
// GFX89: s_add_i32 s0, src_execz, s0 ; encoding: [0xfc,0x00,0x00,0x81]
-// NOGFX11: :[[@LINE-3]]:15: error: src_execz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:15: error: src_execz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:15: error: src_execz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:15: error: src_execz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:15: error: src_execz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:15: error: src_execz register not available on this GPU
+// SICI: s_add_i32 s0, src_execz, s0 ; encoding: [0xfc,0x00,0x00,0x81]
s_add_i32 s0, scc, s0
-// SICI: s_add_i32 s0, src_scc, s0 ; encoding: [0xfd,0x00,0x00,0x81]
-// GFX89: s_add_i32 s0, src_scc, s0 ; encoding: [0xfd,0x00,0x00,0x81]
-// GFX12XX: s_add_co_i32 s0, src_scc, s0 ; encoding: [0xfd,0x00,0x00,0x81]
// GFX11: s_add_i32 s0, src_scc, s0 ; encoding: [0xfd,0x00,0x00,0x81]
+// GFX12XX: s_add_co_i32 s0, src_scc, s0 ; encoding: [0xfd,0x00,0x00,0x81]
+// GFX89: s_add_i32 s0, src_scc, s0 ; encoding: [0xfd,0x00,0x00,0x81]
+// SICI: s_add_i32 s0, src_scc, s0 ; encoding: [0xfd,0x00,0x00,0x81]
s_and_b64 s[0:1], s[0:1], src_vccz
-// SICI: s_and_b64 s[0:1], s[0:1], src_vccz ; encoding: [0x00,0xfb,0x80,0x87]
// GFX89: s_and_b64 s[0:1], s[0:1], src_vccz ; encoding: [0x00,0xfb,0x80,0x86]
-// NOGFX11: :[[@LINE-3]]:27: error: src_vccz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:27: error: src_vccz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:27: error: src_vccz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:27: error: src_vccz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:27: error: src_vccz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:27: error: src_vccz register not available on this GPU
+// SICI: s_and_b64 s[0:1], s[0:1], src_vccz ; encoding: [0x00,0xfb,0x80,0x87]
s_and_b64 s[0:1], s[0:1], src_execz
-// SICI: s_and_b64 s[0:1], s[0:1], src_execz ; encoding: [0x00,0xfc,0x80,0x87]
// GFX89: s_and_b64 s[0:1], s[0:1], src_execz ; encoding: [0x00,0xfc,0x80,0x86]
-// NOGFX11: :[[@LINE-3]]:27: error: src_execz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:27: error: src_execz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:27: error: src_execz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:27: error: src_execz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:27: error: src_execz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:27: error: src_execz register not available on this GPU
+// SICI: s_and_b64 s[0:1], s[0:1], src_execz ; encoding: [0x00,0xfc,0x80,0x87]
s_and_b64 s[0:1], s[0:1], src_scc
-// SICI: s_and_b64 s[0:1], s[0:1], src_scc ; encoding: [0x00,0xfd,0x80,0x87]
-// GFX89: s_and_b64 s[0:1], s[0:1], src_scc ; encoding: [0x00,0xfd,0x80,0x86]
-// GFX12XX: s_and_b64 s[0:1], s[0:1], src_scc ; encoding: [0x00,0xfd,0x80,0x8b]
// GFX11: s_and_b64 s[0:1], s[0:1], src_scc ; encoding: [0x00,0xfd,0x80,0x8b]
+// GFX12XX: s_and_b64 s[0:1], s[0:1], src_scc ; encoding: [0x00,0xfd,0x80,0x8b]
+// GFX89: s_and_b64 s[0:1], s[0:1], src_scc ; encoding: [0x00,0xfd,0x80,0x86]
+// SICI: s_and_b64 s[0:1], s[0:1], src_scc ; encoding: [0x00,0xfd,0x80,0x87]
v_add_u16 v0, vccz, v0
// GFX89: v_add_u16_e32 v0, src_vccz, v0 ; encoding: [0xfb,0x00,0x00,0x4c]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
// NOSICIVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_add_u16_sdwa v0, scc, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
// GFX9: v_add_u16_sdwa v0, src_scc, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x00,0x00,0x4c,0xfd,0x06,0x86,0x06]
-// NOVI: :[[@LINE-3]]:20: error: invalid operand for instruction
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:20: error: invalid operand for instruction
// NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
v_add_u16_sdwa v0, v0, scc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
// GFX9: v_add_u16_sdwa v0, v0, src_scc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0xfa,0x01,0x4c,0x00,0x06,0x06,0x86]
-// NOVI: :[[@LINE-3]]:24: error: invalid operand for instruction
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:24: error: invalid operand for instruction
// NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
v_add_u32 v0, execz, v0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
// GFX9: v_add_u32_e32 v0, src_execz, v0 ; encoding: [0xfc,0x00,0x00,0x68]
-// NOVI: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
-// NOGFX11: :[[@LINE-4]]:15: error: src_execz register not available on this GPU
-// NOGFX12: :[[@LINE-5]]:15: error: src_execz register not available on this GPU
-// NOGFX1250: :[[@LINE-6]]:15: error: src_execz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:15: error: src_execz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:15: error: src_execz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:15: error: src_execz register not available on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:1: error: operands are not valid for this GPU or mode
// NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
v_add_u32_e64 v0, scc, v0
+// GFX11: v_add_nc_u32_e64 v0, src_scc, v0 ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x02,0x00]
// GFX12XX: v_add_nc_u32_e64 v0, src_scc, v0 ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x02,0x00]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX9: v_add_u32_e64 v0, src_scc, v0 ; encoding: [0x00,0x00,0x34,0xd1,0xfd,0x00,0x02,0x00]
-// GFX11: v_add_nc_u32_e64 v0, src_scc, v0 ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x02,0x00]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
// NOVI: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
// NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
v_cmp_eq_i64 vcc, scc, v[0:1]
-// SICI: v_cmp_eq_i64_e32 vcc, src_scc, v[0:1] ; encoding: [0xfd,0x00,0x44,0x7d]
// GFX89: v_cmp_eq_i64_e32 vcc, src_scc, v[0:1] ; encoding: [0xfd,0x00,0xc4,0x7d]
-// NOGFX11: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
-// NOGFX12: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
-// NOGFX1250: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
+// NOGFX11: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
+// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX1250: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
+// SICI: v_cmp_eq_i64_e32 vcc, src_scc, v[0:1] ; encoding: [0xfd,0x00,0x44,0x7d]
v_max_f16 v0, execz, v0
// GFX89: v_max_f16_e32 v0, src_execz, v0 ; encoding: [0xfc,0x00,0x00,0x5a]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-// NOGFX11: :[[@LINE-3]]:15: error: src_execz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:15: error: src_execz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:15: error: src_execz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:15: error: src_execz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:15: error: src_execz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:15: error: src_execz register not available on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
// NOSICIVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_max_f32 v0, vccz, v0
-// SICI: v_max_f32_e32 v0, src_vccz, v0 ; encoding: [0xfb,0x00,0x00,0x20]
// GFX89: v_max_f32_e32 v0, src_vccz, v0 ; encoding: [0xfb,0x00,0x00,0x16]
-// NOGFX11: :[[@LINE-3]]:15: error: src_vccz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:15: error: src_vccz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:15: error: src_vccz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:15: error: src_vccz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:15: error: src_vccz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:15: error: src_vccz register not available on this GPU
+// SICI: v_max_f32_e32 v0, src_vccz, v0 ; encoding: [0xfb,0x00,0x00,0x20]
v_max_f64 v[0:1], scc, v[0:1]
-// SICI: v_max_f64 v[0:1], src_scc, v[0:1] ; encoding: [0x00,0x00,0xce,0xd2,0xfd,0x00,0x02,0x00]
-// GFX89: v_max_f64 v[0:1], src_scc, v[0:1] ; encoding: [0x00,0x00,0x83,0xd2,0xfd,0x00,0x02,0x00]
-// GFX12XX: v_max_num_f64_e32 v[0:1], src_scc, v[0:1] ; encoding: [0xfd,0x00,0x00,0x1c]
// GFX11: v_max_f64 v[0:1], src_scc, v[0:1] ; encoding: [0x00,0x00,0x2a,0xd7,0xfd,0x00,0x02,0x00]
+// GFX12XX: v_max_num_f64_e32 v[0:1], src_scc, v[0:1] ; encoding: [0xfd,0x00,0x00,0x1c]
+// GFX89: v_max_f64 v[0:1], src_scc, v[0:1] ; encoding: [0x00,0x00,0x83,0xd2,0xfd,0x00,0x02,0x00]
+// SICI: v_max_f64 v[0:1], src_scc, v[0:1] ; encoding: [0x00,0x00,0xce,0xd2,0xfd,0x00,0x02,0x00]
v_pk_add_f16 v0, execz, v0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
// GFX9: v_pk_add_f16 v0, src_execz, v0 ; encoding: [0x00,0x40,0x8f,0xd3,0xfc,0x00,0x02,0x18]
-// NOVI: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX11: :[[@LINE-4]]:18: error: src_execz register not available on this GPU
-// NOGFX12: :[[@LINE-5]]:18: error: src_execz register not available on this GPU
-// NOGFX1250: :[[@LINE-6]]:18: error: src_execz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:18: error: src_execz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:18: error: src_execz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:18: error: src_execz register not available on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:1: error: instruction not supported on this GPU
// NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
v_ceil_f16 v0, neg(vccz)
// GFX89: v_ceil_f16_e64 v0, -src_vccz ; encoding: [0x00,0x00,0x85,0xd1,0xfb,0x00,0x00,0x20]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-// NOGFX11: :[[@LINE-3]]:20: error: src_vccz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:20: error: src_vccz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:20: error: src_vccz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:20: error: src_vccz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:20: error: src_vccz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:20: error: src_vccz register not available on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
// NOSICIVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_ceil_f16 v0, abs(scc)
-// GFX89: v_ceil_f16_e64 v0, |src_scc| ; encoding: [0x00,0x01,0x85,0xd1,0xfd,0x00,0x00,0x00]
-// GFX12XX: v_ceil_f16_e64 v0, |src_scc| ; encoding: [0x00,0x01,0xdc,0xd5,0xfd,0x00,0x00,0x00]
-// NOSICI: :[[@LINE-3]]:1: error: instruction not supported on this GPU
// GFX11: v_ceil_f16_e64 v0, |src_scc| ; encoding: [0x00,0x01,0xdc,0xd5,0xfd,0x00,0x00,0x00]
+// GFX12XX: v_ceil_f16_e64 v0, |src_scc| ; encoding: [0x00,0x01,0xdc,0xd5,0xfd,0x00,0x00,0x00]
+// GFX89: v_ceil_f16_e64 v0, |src_scc| ; encoding: [0x00,0x01,0x85,0xd1,0xfd,0x00,0x00,0x00]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
// NOSICIVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
v_ceil_f64 v[5:6], |execz|
-// GFX89: v_ceil_f64_e64 v[5:6], |src_execz| ; encoding: [0x05,0x01,0x58,0xd1,0xfc,0x00,0x00,0x00]
// CI: v_ceil_f64_e64 v[5:6], |src_execz| ; encoding: [0x05,0x01,0x30,0xd3,0xfc,0x00,0x00,0x00]
-// NOSI: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX11: :[[@LINE-4]]:21: error: src_execz register not available on this GPU
-// NOGFX12: :[[@LINE-5]]:21: error: src_execz register not available on this GPU
-// NOGFX1250: :[[@LINE-6]]:21: error: src_execz register not available on this GPU
+// GFX89: v_ceil_f64_e64 v[5:6], |src_execz| ; encoding: [0x05,0x01,0x58,0xd1,0xfc,0x00,0x00,0x00]
+// NOGFX11: :[[@LINE-3]]:21: error: src_execz register not available on this GPU
+// NOGFX12: :[[@LINE-4]]:21: error: src_execz register not available on this GPU
+// NOGFX1250: :[[@LINE-5]]:21: error: src_execz register not available on this GPU
+// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU
v_ceil_f64 v[5:6], -vcc
-// GFX89: v_ceil_f64_e64 v[5:6], -vcc ; encoding: [0x05,0x00,0x58,0xd1,0x6a,0x00,0x00,0x20]
// CI: v_ceil_f64_e64 v[5:6], -vcc ; encoding: [0x05,0x00,0x30,0xd3,0x6a,0x00,0x00,0x20]
// GFX11: v_ceil_f64_e64 v[5:6], -vcc ; encoding: [0x05,0x00,0x98,0xd5,0x6a,0x00,0x00,0x20]
// GFX12: v_ceil_f64_e64 v[5:6], -vcc ; encoding: [0x05,0x00,0x98,0xd5,0x6a,0x00,0x00,0x20]
-// NOSI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-6]]:12: error: invalid operand for instruction
+// GFX89: v_ceil_f64_e64 v[5:6], -vcc ; encoding: [0x05,0x00,0x58,0xd1,0x6a,0x00,0x00,0x20]
+// NOGFX1250: :[[@LINE-5]]:12: error: invalid operand for instruction
+// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU
v_ceil_f32 v0, -vccz
-// SICI: v_ceil_f32_e64 v0, -src_vccz ; encoding: [0x00,0x00,0x44,0xd3,0xfb,0x00,0x00,0x20]
// GFX89: v_ceil_f32_e64 v0, -src_vccz ; encoding: [0x00,0x00,0x5d,0xd1,0xfb,0x00,0x00,0x20]
-// NOGFX11: :[[@LINE-3]]:17: error: src_vccz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:17: error: src_vccz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:17: error: src_vccz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:17: error: src_vccz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:17: error: src_vccz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:17: error: src_vccz register not available on this GPU
+// SICI: v_ceil_f32_e64 v0, -src_vccz ; encoding: [0x00,0x00,0x44,0xd3,0xfb,0x00,0x00,0x20]
v_ceil_f32 v0, |execz|
-// SICI: v_ceil_f32_e64 v0, |src_execz| ; encoding: [0x00,0x01,0x44,0xd3,0xfc,0x00,0x00,0x00]
// GFX89: v_ceil_f32_e64 v0, |src_execz| ; encoding: [0x00,0x01,0x5d,0xd1,0xfc,0x00,0x00,0x00]
-// NOGFX11: :[[@LINE-3]]:17: error: src_execz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:17: error: src_execz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:17: error: src_execz register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:17: error: src_execz register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:17: error: src_execz register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:17: error: src_execz register not available on this GPU
+// SICI: v_ceil_f32_e64 v0, |src_execz| ; encoding: [0x00,0x01,0x44,0xd3,0xfc,0x00,0x00,0x00]
v_ceil_f16_sdwa v5, |vccz| dst_sel:DWORD dst_unused:UNUSED_PRESERVE
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
// GFX9: v_ceil_f16_sdwa v5, |src_vccz| dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x8a,0x0a,0x7e,0xfb,0x16,0xa6,0x00]
-// NOVI: :[[@LINE-3]]:22: error: invalid operand for instruction
-// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:22: error: invalid operand for instruction
// NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
v_ceil_f16_sdwa v5, -scc dst_sel:DWORD dst_unused:UNUSED_PRESERVE
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
// GFX9: v_ceil_f16_sdwa v5, -src_scc dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x8a,0x0a,0x7e,0xfd,0x16,0x96,0x00]
-// NOVI: :[[@LINE-3]]:22: error: invalid operand for instruction
-// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:22: error: invalid operand for instruction
// NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
v_ceil_f32_sdwa v5, vccz dst_sel:DWORD src0_sel:DWORD
-// NOSICI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported
// GFX9: v_ceil_f32_sdwa v5, src_vccz dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x3a,0x0a,0x7e,0xfb,0x16,0x86,0x00]
-// NOVI: :[[@LINE-3]]:21: error: invalid operand for instruction
-// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
+// NOSICI: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
+// NOVI: :[[@LINE-6]]:21: error: invalid operand for instruction
// NOSICIVI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported
v_ceil_f32_sdwa v5, |execz| dst_sel:DWORD src0_sel:DWORD
-// NOSICI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported
// GFX9: v_ceil_f32_sdwa v5, |src_execz| dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x3a,0x0a,0x7e,0xfc,0x16,0xa6,0x00]
-// NOVI: :[[@LINE-3]]:22: error: invalid operand for instruction
-// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
+// NOSICI: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
+// NOVI: :[[@LINE-6]]:22: error: invalid operand for instruction
// NOSICIVI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported
//---------------------------------------------------------------------------//
@@ -1317,266 +1317,266 @@ v_ceil_f32_sdwa v5, |execz| dst_sel:DWORD src0_sel:DWORD
//---------------------------------------------------------------------------//
buffer_atomic_add v0, off, s[0:3], src_shared_base offset:4095
-// NOSICI: :[[@LINE-1]]:36: error: src_shared_base register not available on this GPU
-// GFX9: buffer_atomic_add v0, off, s[0:3], src_shared_base offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x00,0xeb]
// GFX11: buffer_atomic_add_u32 v0, off, s[0:3], src_shared_base offset:4095 ; encoding: [0xff,0x0f,0xd4,0xe0,0x00,0x00,0x00,0xeb]
-// NOVI: :[[@LINE-4]]:36: error: src_shared_base register not available on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
-// NOGFX1250: :[[@LINE-6]]:1: error: operands are not valid for this GPU or mode
+// GFX9: buffer_atomic_add v0, off, s[0:3], src_shared_base offset:4095 ; encoding: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x00,0xeb]
+// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX1250: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
+// NOSICI: :[[@LINE-5]]:36: error: src_shared_base register not available on this GPU
+// NOVI: :[[@LINE-6]]:36: error: src_shared_base register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:36: error: src_shared_base register not available on this GPU
s_add_i32 s0, src_shared_base, s0
+// GFX11: s_add_i32 s0, src_shared_base, s0 ; encoding: [0xeb,0x00,0x00,0x81]
// GFX12XX: s_add_co_i32 s0, src_shared_base, s0 ; encoding: [0xeb,0x00,0x00,0x81]
-// NOSICI: :[[@LINE-2]]:15: error: src_shared_base register not available on this GPU
// GFX9: s_add_i32 s0, src_shared_base, s0 ; encoding: [0xeb,0x00,0x00,0x81]
-// GFX11: s_add_i32 s0, src_shared_base, s0 ; encoding: [0xeb,0x00,0x00,0x81]
+// NOSICI: :[[@LINE-4]]:15: error: src_shared_base register not available on this GPU
// NOVI: :[[@LINE-5]]:15: error: src_shared_base register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:15: error: src_shared_base register not available on this GPU
s_add_i32 s0, src_shared_limit, s0
+// GFX11: s_add_i32 s0, src_shared_limit, s0 ; encoding: [0xec,0x00,0x00,0x81]
// GFX12XX: s_add_co_i32 s0, src_shared_limit, s0 ; encoding: [0xec,0x00,0x00,0x81]
-// NOSICI: :[[@LINE-2]]:15: error: src_shared_limit register not available on this GPU
// GFX9: s_add_i32 s0, src_shared_limit, s0 ; encoding: [0xec,0x00,0x00,0x81]
-// GFX11: s_add_i32 s0, src_shared_limit, s0 ; encoding: [0xec,0x00,0x00,0x81]
+// NOSICI: :[[@LINE-4]]:15: error: src_shared_limit register not available on this GPU
// NOVI: :[[@LINE-5]]:15: error: src_shared_limit register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:15: error: src_shared_limit register not available on this GPU
s_add_i32 s0, src_private_base, s0
+// GFX11: s_add_i32 s0, src_private_base, s0 ; encoding: [0xed,0x00,0x00,0x81]
// GFX12XX: s_add_co_i32 s0, src_private_base, s0 ; encoding: [0xed,0x00,0x00,0x81]
-// NOSICI: :[[@LINE-2]]:15: error: src_private_base register not available on this GPU
// GFX9: s_add_i32 s0, src_private_base, s0 ; encoding: [0xed,0x00,0x00,0x81]
-// GFX11: s_add_i32 s0, src_private_base, s0 ; encoding: [0xed,0x00,0x00,0x81]
+// NOSICI: :[[@LINE-4]]:15: error: src_private_base register not available on this GPU
// NOVI: :[[@LINE-5]]:15: error: src_private_base register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:15: error: src_private_base register not available on this GPU
s_add_i32 s0, src_private_limit, s0
+// GFX11: s_add_i32 s0, src_private_limit, s0 ; encoding: [0xee,0x00,0x00,0x81]
// GFX12XX: s_add_co_i32 s0, src_private_limit, s0 ; encoding: [0xee,0x00,0x00,0x81]
-// NOSICI: :[[@LINE-2]]:15: error: src_private_limit register not available on this GPU
// GFX9: s_add_i32 s0, src_private_limit, s0 ; encoding: [0xee,0x00,0x00,0x81]
-// GFX11: s_add_i32 s0, src_private_limit, s0 ; encoding: [0xee,0x00,0x00,0x81]
+// NOSICI: :[[@LINE-4]]:15: error: src_private_limit register not available on this GPU
// NOVI: :[[@LINE-5]]:15: error: src_private_limit register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:15: error: src_private_limit register not available on this GPU
s_add_i32 s0, src_pops_exiting_wave_id, s0
-// NOSICI: :[[@LINE-1]]:15: error: src_pops_exiting_wave_id register not available on this GPU
// GFX9: s_add_i32 s0, src_pops_exiting_wave_id, s0 ; encoding: [0xef,0x00,0x00,0x81]
-// NOVI: :[[@LINE-3]]:15: error: src_pops_exiting_wave_id register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:15: error: src_pops_exiting_wave_id register not available on this GPU
-// NOGFX12: :[[@LINE-5]]:15: error: src_pops_exiting_wave_id register not available on this GPU
-// NOGFX1250: :[[@LINE-6]]:15: error: src_pops_exiting_wave_id register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:15: error: src_pops_exiting_wave_id register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:15: error: src_pops_exiting_wave_id register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:15: error: src_pops_exiting_wave_id register not available on this GPU
+// NOSICI: :[[@LINE-5]]:15: error: src_pops_exiting_wave_id register not available on this GPU
+// NOVI: :[[@LINE-6]]:15: error: src_pops_exiting_wave_id register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:15: error: src_pops_exiting_wave_id register not available on this GPU
s_and_b64 s[0:1], s[0:1], src_shared_base
+// GFX11: s_and_b64 s[0:1], s[0:1], src_shared_base ; encoding: [0x00,0xeb,0x80,0x8b]
// GFX12XX: s_and_b64 s[0:1], s[0:1], src_shared_base ; encoding: [0x00,0xeb,0x80,0x8b]
-// NOSICI: :[[@LINE-2]]:27: error: src_shared_base register not available on this GPU
// GFX9: s_and_b64 s[0:1], s[0:1], src_shared_base ; encoding: [0x00,0xeb,0x80,0x86]
-// GFX11: s_and_b64 s[0:1], s[0:1], src_shared_base ; encoding: [0x00,0xeb,0x80,0x8b]
+// NOSICI: :[[@LINE-4]]:27: error: src_shared_base register not available on this GPU
// NOVI: :[[@LINE-5]]:27: error: src_shared_base register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:27: error: src_shared_base register not available on this GPU
s_and_b64 s[0:1], s[0:1], src_shared_limit
+// GFX11: s_and_b64 s[0:1], s[0:1], src_shared_limit ; encoding: [0x00,0xec,0x80,0x8b]
// GFX12XX: s_and_b64 s[0:1], s[0:1], src_shared_limit ; encoding: [0x00,0xec,0x80,0x8b]
-// NOSICI: :[[@LINE-2]]:27: error: src_shared_limit register not available on this GPU
// GFX9: s_and_b64 s[0:1], s[0:1], src_shared_limit ; encoding: [0x00,0xec,0x80,0x86]
-// GFX11: s_and_b64 s[0:1], s[0:1], src_shared_limit ; encoding: [0x00,0xec,0x80,0x8b]
+// NOSICI: :[[@LINE-4]]:27: error: src_shared_limit register not available on this GPU
// NOVI: :[[@LINE-5]]:27: error: src_shared_limit register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:27: error: src_shared_limit register not available on this GPU
s_and_b64 s[0:1], s[0:1], src_private_base
+// GFX11: s_and_b64 s[0:1], s[0:1], src_private_base ; encoding: [0x00,0xed,0x80,0x8b]
// GFX12XX: s_and_b64 s[0:1], s[0:1], src_private_base ; encoding: [0x00,0xed,0x80,0x8b]
-// NOSICI: :[[@LINE-2]]:27: error: src_private_base register not available on this GPU
// GFX9: s_and_b64 s[0:1], s[0:1], src_private_base ; encoding: [0x00,0xed,0x80,0x86]
-// GFX11: s_and_b64 s[0:1], s[0:1], src_private_base ; encoding: [0x00,0xed,0x80,0x8b]
+// NOSICI: :[[@LINE-4]]:27: error: src_private_base register not available on this GPU
// NOVI: :[[@LINE-5]]:27: error: src_private_base register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:27: error: src_private_base register not available on this GPU
s_and_b64 s[0:1], s[0:1], src_private_limit
+// GFX11: s_and_b64 s[0:1], s[0:1], src_private_limit ; encoding: [0x00,0xee,0x80,0x8b]
// GFX12XX: s_and_b64 s[0:1], s[0:1], src_private_limit ; encoding: [0x00,0xee,0x80,0x8b]
-// NOSICI: :[[@LINE-2]]:27: error: src_private_limit register not available on this GPU
// GFX9: s_and_b64 s[0:1], s[0:1], src_private_limit ; encoding: [0x00,0xee,0x80,0x86]
-// GFX11: s_and_b64 s[0:1], s[0:1], src_private_limit ; encoding: [0x00,0xee,0x80,0x8b]
+// NOSICI: :[[@LINE-4]]:27: error: src_private_limit register not available on this GPU
// NOVI: :[[@LINE-5]]:27: error: src_private_limit register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:27: error: src_private_limit register not available on this GPU
s_and_b64 s[0:1], s[0:1], src_pops_exiting_wave_id
-// NOSICI: :[[@LINE-1]]:27: error: src_pops_exiting_wave_id register not available on this GPU
// GFX9: s_and_b64 s[0:1], s[0:1], src_pops_exiting_wave_id ; encoding: [0x00,0xef,0x80,0x86]
-// NOVI: :[[@LINE-3]]:27: error: src_pops_exiting_wave_id register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:27: error: src_pops_exiting_wave_id register not available on this GPU
-// NOGFX12: :[[@LINE-5]]:27: error: src_pops_exiting_wave_id register not available on this GPU
-// NOGFX1250: :[[@LINE-6]]:27: error: src_pops_exiting_wave_id register not available on this GPU
+// NOGFX11: :[[@LINE-2]]:27: error: src_pops_exiting_wave_id register not available on this GPU
+// NOGFX12: :[[@LINE-3]]:27: error: src_pops_exiting_wave_id register not available on this GPU
+// NOGFX1250: :[[@LINE-4]]:27: error: src_pops_exiting_wave_id register not available on this GPU
+// NOSICI: :[[@LINE-5]]:27: error: src_pops_exiting_wave_id register not available on this GPU
+// NOVI: :[[@LINE-6]]:27: error: src_pops_exiting_wave_id register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:27: error: src_pops_exiting_wave_id register not available on this GPU
v_add_u16 v0, src_shared_base, v0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
// GFX9: v_add_u16_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x4c]
-// NOVI: :[[@LINE-3]]:15: error: src_shared_base register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:15: error: src_shared_base register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
v_add_u16_sdwa v0, src_shared_base, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
// GFX9: v_add_u16_sdwa v0, src_shared_base, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x00,0x00,0x4c,0xeb,0x06,0x86,0x06]
-// NOVI: :[[@LINE-3]]:20: error: src_shared_base register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:20: error: src_shared_base register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
v_add_u16_sdwa v0, v0, src_shared_base dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
// GFX9: v_add_u16_sdwa v0, v0, src_shared_base dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0xd6,0x01,0x4c,0x00,0x06,0x06,0x86]
-// NOVI: :[[@LINE-3]]:24: error: src_shared_base register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:24: error: src_shared_base register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
v_add_u32 v0, src_shared_base, v0
+// GFX11: v_add_nc_u32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x4a]
// GFX12XX: v_add_nc_u32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x4a]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX9: v_add_u32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x68]
-// GFX11: v_add_nc_u32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x4a]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
// NOVI: :[[@LINE-5]]:15: error: src_shared_base register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
v_add_u32_e64 v0, src_shared_base, v0
+// GFX11: v_add_nc_u32_e64 v0, src_shared_base, v0 ; encoding: [0x00,0x00,0x25,0xd5,0xeb,0x00,0x02,0x00]
// GFX12XX: v_add_nc_u32_e64 v0, src_shared_base, v0 ; encoding: [0x00,0x00,0x25,0xd5,0xeb,0x00,0x02,0x00]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX9: v_add_u32_e64 v0, src_shared_base, v0 ; encoding: [0x00,0x00,0x34,0xd1,0xeb,0x00,0x02,0x00]
-// GFX11: v_add_nc_u32_e64 v0, src_shared_base, v0 ; encoding: [0x00,0x00,0x25,0xd5,0xeb,0x00,0x02,0x00]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
// NOVI: :[[@LINE-5]]:19: error: src_shared_base register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
v_cmp_eq_i64 vcc, src_shared_base, v[0:1]
-// NOSICI: :[[@LINE-1]]:19: error: src_shared_base register not available on this GPU
// GFX9: v_cmp_eq_i64_e32 vcc, src_shared_base, v[0:1] ; encoding: [0xeb,0x00,0xc4,0x7d]
-// NOVI: :[[@LINE-3]]:19: error: src_shared_base register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
-// NOGFX12: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
-// NOGFX1250: :[[@LINE-6]]:1: error: operands are not valid for this GPU or mode
+// NOGFX11: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode
+// NOGFX12: :[[@LINE-3]]:1: error: operands are not valid for this GPU or mode
+// NOGFX1250: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
+// NOSICI: :[[@LINE-5]]:19: error: src_shared_base register not available on this GPU
+// NOVI: :[[@LINE-6]]:19: error: src_shared_base register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:19: error: src_shared_base register not available on this GPU
v_max_f16 v0, src_shared_base, v0
+// GFX11: v_max_f16_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x72]
// GFX12XX: v_max_num_f16_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x62]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX9: v_max_f16_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x5a]
-// GFX11: v_max_f16_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x72]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
// NOVI: :[[@LINE-5]]:15: error: src_shared_base register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
v_max_f32 v0, src_shared_base, v0
+// GFX11: v_max_f32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x20]
// GFX12XX: v_max_num_f32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x2c]
-// NOSICI: :[[@LINE-2]]:15: error: src_shared_base register not available on this GPU
// GFX9: v_max_f32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x16]
-// GFX11: v_max_f32_e32 v0, src_shared_base, v0 ; encoding: [0xeb,0x00,0x00,0x20]
+// NOSICI: :[[@LINE-4]]:15: error: src_shared_base register not available on this GPU
// NOVI: :[[@LINE-5]]:15: error: src_shared_base register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:15: error: src_shared_base register not available on this GPU
v_max_f64 v[0:1], src_shared_base, v[0:1]
+// GFX11: v_max_f64 v[0:1], src_shared_base, v[0:1] ; encoding: [0x00,0x00,0x2a,0xd7,0xeb,0x00,0x02,0x00]
// GFX12XX: v_max_num_f64_e32 v[0:1], src_shared_base, v[0:1] ; encoding: [0xeb,0x00,0x00,0x1c]
-// NOSICI: :[[@LINE-2]]:19: error: src_shared_base register not available on this GPU
// GFX9: v_max_f64 v[0:1], src_shared_base, v[0:1] ; encoding: [0x00,0x00,0x83,0xd2,0xeb,0x00,0x02,0x00]
-// GFX11: v_max_f64 v[0:1], src_shared_base, v[0:1] ; encoding: [0x00,0x00,0x2a,0xd7,0xeb,0x00,0x02,0x00]
+// NOSICI: :[[@LINE-4]]:19: error: src_shared_base register not available on this GPU
// NOVI: :[[@LINE-5]]:19: error: src_shared_base register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:19: error: src_shared_base register not available on this GPU
v_pk_add_f16 v0, src_shared_base, v0
+// GFX11: v_pk_add_f16 v0, src_shared_base, v0 ; encoding: [0x00,0x40,0x0f,0xcc,0xeb,0x00,0x02,0x18]
// GFX12XX: v_pk_add_f16 v0, src_shared_base, v0 ; encoding: [0x00,0x40,0x0f,0xcc,0xeb,0x00,0x02,0x18]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX9: v_pk_add_f16 v0, src_shared_base, v0 ; encoding: [0x00,0x40,0x8f,0xd3,0xeb,0x00,0x02,0x18]
-// GFX11: v_pk_add_f16 v0, src_shared_base, v0 ; encoding: [0x00,0x40,0x0f,0xcc,0xeb,0x00,0x02,0x18]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
// NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
// NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
v_ceil_f16 v0, neg(src_shared_base)
+// GFX11: v_ceil_f16_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0xdc,0xd5,0xeb,0x00,0x00,0x20]
// GFX12XX: v_ceil_f16_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0xdc,0xd5,0xeb,0x00,0x00,0x20]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX9: v_ceil_f16_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0x85,0xd1,0xeb,0x00,0x00,0x20]
-// GFX11: v_ceil_f16_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0xdc,0xd5,0xeb,0x00,0x00,0x20]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
// NOVI: :[[@LINE-5]]:20: error: src_shared_base register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
v_ceil_f16 v0, abs(src_shared_base)
+// GFX11: v_ceil_f16_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0xdc,0xd5,0xeb,0x00,0x00,0x00]
// GFX12XX: v_ceil_f16_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0xdc,0xd5,0xeb,0x00,0x00,0x00]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX9: v_ceil_f16_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0x85,0xd1,0xeb,0x00,0x00,0x00]
-// GFX11: v_ceil_f16_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0xdc,0xd5,0xeb,0x00,0x00,0x00]
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
// NOVI: :[[@LINE-5]]:20: error: src_shared_base register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
v_ceil_f64 v[5:6], |src_shared_base|
-// GFX9: v_ceil_f64_e64 v[5:6], |src_shared_base| ; encoding: [0x05,0x01,0x58,0xd1,0xeb,0x00,0x00,0x00]
// GFX11: v_ceil_f64_e64 v[5:6], |src_shared_base| ; encoding: [0x05,0x01,0x98,0xd5,0xeb,0x00,0x00,0x00]
// GFX12: v_ceil_f64_e64 v[5:6], |src_shared_base| ; encoding: [0x05,0x01,0x98,0xd5,0xeb,0x00,0x00,0x00]
-// NOSI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOCI: :[[@LINE-5]]:21: error: src_shared_base register not available on this GPU
-// NOVI: :[[@LINE-6]]:21: error: src_shared_base register not available on this GPU
-// NOGFX1250: :[[@LINE-7]]:12: error: invalid operand for instruction
+// GFX9: v_ceil_f64_e64 v[5:6], |src_shared_base| ; encoding: [0x05,0x01,0x58,0xd1,0xeb,0x00,0x00,0x00]
+// NOCI: :[[@LINE-4]]:21: error: src_shared_base register not available on this GPU
+// NOGFX1250: :[[@LINE-5]]:12: error: invalid operand for instruction
+// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-7]]:21: error: src_shared_base register not available on this GPU
// NOCIVI: :[[@LINE-5]]:21: error: src_shared_base register not available on this GPU
v_ceil_f64 v[5:6], -src_shared_base
-// GFX9: v_ceil_f64_e64 v[5:6], -src_shared_base ; encoding: [0x05,0x00,0x58,0xd1,0xeb,0x00,0x00,0x20]
// GFX11: v_ceil_f64_e64 v[5:6], -src_shared_base ; encoding: [0x05,0x00,0x98,0xd5,0xeb,0x00,0x00,0x20]
// GFX12: v_ceil_f64_e64 v[5:6], -src_shared_base ; encoding: [0x05,0x00,0x98,0xd5,0xeb,0x00,0x00,0x20]
-// NOSI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOCI: :[[@LINE-5]]:21: error: src_shared_base register not available on this GPU
-// NOVI: :[[@LINE-6]]:21: error: src_shared_base register not available on this GPU
-// NOGFX1250: :[[@LINE-7]]:12: error: invalid operand for instruction
+// GFX9: v_ceil_f64_e64 v[5:6], -src_shared_base ; encoding: [0x05,0x00,0x58,0xd1,0xeb,0x00,0x00,0x20]
+// NOCI: :[[@LINE-4]]:21: error: src_shared_base register not available on this GPU
+// NOGFX1250: :[[@LINE-5]]:12: error: invalid operand for instruction
+// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-7]]:21: error: src_shared_base register not available on this GPU
// NOCIVI: :[[@LINE-5]]:21: error: src_shared_base register not available on this GPU
v_ceil_f32 v0, -src_shared_base
+// GFX11: v_ceil_f32_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0xa2,0xd5,0xeb,0x00,0x00,0x20]
// GFX12XX: v_ceil_f32_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0xa2,0xd5,0xeb,0x00,0x00,0x20]
-// NOSICI: :[[@LINE-2]]:17: error: src_shared_base register not available on this GPU
// GFX9: v_ceil_f32_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0x5d,0xd1,0xeb,0x00,0x00,0x20]
-// GFX11: v_ceil_f32_e64 v0, -src_shared_base ; encoding: [0x00,0x00,0xa2,0xd5,0xeb,0x00,0x00,0x20]
+// NOSICI: :[[@LINE-4]]:17: error: src_shared_base register not available on this GPU
// NOVI: :[[@LINE-5]]:17: error: src_shared_base register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:17: error: src_shared_base register not available on this GPU
v_ceil_f32 v0, |src_shared_base|
+// GFX11: v_ceil_f32_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0xa2,0xd5,0xeb,0x00,0x00,0x00]
// GFX12XX: v_ceil_f32_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0xa2,0xd5,0xeb,0x00,0x00,0x00]
-// NOSICI: :[[@LINE-2]]:17: error: src_shared_base register not available on this GPU
// GFX9: v_ceil_f32_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0x5d,0xd1,0xeb,0x00,0x00,0x00]
-// GFX11: v_ceil_f32_e64 v0, |src_shared_base| ; encoding: [0x00,0x01,0xa2,0xd5,0xeb,0x00,0x00,0x00]
+// NOSICI: :[[@LINE-4]]:17: error: src_shared_base register not available on this GPU
// NOVI: :[[@LINE-5]]:17: error: src_shared_base register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:17: error: src_shared_base register not available on this GPU
v_ceil_f16_sdwa v5, |src_shared_base| dst_sel:DWORD dst_unused:UNUSED_PRESERVE
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
// GFX9: v_ceil_f16_sdwa v5, |src_shared_base| dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x8a,0x0a,0x7e,0xeb,0x16,0xa6,0x00]
-// NOVI: :[[@LINE-3]]:22: error: src_shared_base register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:22: error: src_shared_base register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
v_ceil_f16_sdwa v5, -src_shared_base dst_sel:DWORD dst_unused:UNUSED_PRESERVE
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
// GFX9: v_ceil_f16_sdwa v5, -src_shared_base dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x8a,0x0a,0x7e,0xeb,0x16,0x96,0x00]
-// NOVI: :[[@LINE-3]]:22: error: src_shared_base register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:22: error: src_shared_base register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
v_ceil_f32_sdwa v5, src_shared_base dst_sel:DWORD src0_sel:DWORD
-// NOSICI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported
// GFX9: v_ceil_f32_sdwa v5, src_shared_base dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x3a,0x0a,0x7e,0xeb,0x16,0x86,0x00]
-// NOVI: :[[@LINE-3]]:21: error: src_shared_base register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
+// NOSICI: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
+// NOVI: :[[@LINE-6]]:21: error: src_shared_base register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported
v_ceil_f32_sdwa v5, |src_shared_base| dst_sel:DWORD src0_sel:DWORD
-// NOSICI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported
// GFX9: v_ceil_f32_sdwa v5, |src_shared_base| dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x3a,0x0a,0x7e,0xeb,0x16,0xa6,0x00]
-// NOVI: :[[@LINE-3]]:22: error: src_shared_base register not available on this GPU
-// NOGFX11: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX12: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
-// NOGFX1250: :[[@LINE-6]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX11: :[[@LINE-2]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX12: :[[@LINE-3]]:1: error: sdwa variant of this instruction is not supported
+// NOGFX1250: :[[@LINE-4]]:1: error: sdwa variant of this instruction is not supported
+// NOSICI: :[[@LINE-5]]:1: error: sdwa variant of this instruction is not supported
+// NOVI: :[[@LINE-6]]:22: error: src_shared_base register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:1: error: sdwa variant of this instruction is not supported
//---------------------------------------------------------------------------//
@@ -1584,206 +1584,206 @@ v_ceil_f32_sdwa v5, |src_shared_base| dst_sel:DWORD src0_sel:DWORD
//---------------------------------------------------------------------------//
v_add_u32 v0, private_base, s0
-// GFX12XX: v_add_nc_u32_e64 v0, src_private_base, s0 ; encoding: [0x00,0x00,0x25,0xd5,0xed,0x00,0x00,0x00]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX11: v_add_nc_u32_e64 v0, src_private_base, s0 ; encoding: [0x00,0x00,0x25,0xd5,0xed,0x00,0x00,0x00]
-// NOVI: :[[@LINE-4]]:15: error: src_private_base register not available on this GPU
-// NOGFX9: :[[@LINE-5]]:29: error: invalid operand (violates constant bus restrictions)
+// GFX12XX: v_add_nc_u32_e64 v0, src_private_base, s0 ; encoding: [0x00,0x00,0x25,0xd5,0xed,0x00,0x00,0x00]
+// NOGFX9: :[[@LINE-3]]:29: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-5]]:15: error: src_private_base register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
v_add_u32 v0, scc, s0
-// GFX12XX: v_add_nc_u32_e64 v0, src_scc, s0 ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x00,0x00]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX11: v_add_nc_u32_e64 v0, src_scc, s0 ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x00,0x00]
-// NOVI: :[[@LINE-4]]:1: error: operands are not valid for this GPU or mode
-// NOGFX9: :[[@LINE-5]]:20: error: invalid operand (violates constant bus restrictions)
+// GFX12XX: v_add_nc_u32_e64 v0, src_scc, s0 ; encoding: [0x00,0x00,0x25,0xd5,0xfd,0x00,0x00,0x00]
+// NOGFX9: :[[@LINE-3]]:20: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-5]]:1: error: operands are not valid for this GPU or mode
// NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
// v_div_fmas implicitly reads VCC
v_div_fmas_f32 v0, shared_base, v0, v1
-// GFX12XX: v_div_fmas_f32 v0, src_shared_base, v0, v1 ; encoding: [0x00,0x00,0x37,0xd6,0xeb,0x00,0x06,0x04]
-// NOSICI: :[[@LINE-2]]:20: error: src_shared_base register not available on this GPU
// GFX11: v_div_fmas_f32 v0, src_shared_base, v0, v1 ; encoding: [0x00,0x00,0x37,0xd6,0xeb,0x00,0x06,0x04]
-// NOVI: :[[@LINE-4]]:20: error: src_shared_base register not available on this GPU
-// NOGFX9: :[[@LINE-5]]:20: error: invalid operand (violates constant bus restrictions)
+// GFX12XX: v_div_fmas_f32 v0, src_shared_base, v0, v1 ; encoding: [0x00,0x00,0x37,0xd6,0xeb,0x00,0x06,0x04]
+// NOGFX9: :[[@LINE-3]]:20: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-4]]:20: error: src_shared_base register not available on this GPU
+// NOVI: :[[@LINE-5]]:20: error: src_shared_base register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:20: error: src_shared_base register not available on this GPU
// v_div_fmas implicitly reads VCC
v_div_fmas_f32 v0, v0, shared_limit, v1
-// GFX12XX: v_div_fmas_f32 v0, v0, src_shared_limit, v1 ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xd9,0x05,0x04]
-// NOSICI: :[[@LINE-2]]:24: error: src_shared_limit register not available on this GPU
// GFX11: v_div_fmas_f32 v0, v0, src_shared_limit, v1 ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xd9,0x05,0x04]
-// NOVI: :[[@LINE-4]]:24: error: src_shared_limit register not available on this GPU
-// NOGFX9: :[[@LINE-5]]:24: error: invalid operand (violates constant bus restrictions)
+// GFX12XX: v_div_fmas_f32 v0, v0, src_shared_limit, v1 ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xd9,0x05,0x04]
+// NOGFX9: :[[@LINE-3]]:24: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-4]]:24: error: src_shared_limit register not available on this GPU
+// NOVI: :[[@LINE-5]]:24: error: src_shared_limit register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:24: error: src_shared_limit register not available on this GPU
// v_div_fmas implicitly reads VCC
v_div_fmas_f32 v0, v0, v1, private_limit
-// GFX12XX: v_div_fmas_f32 v0, v0, v1, src_private_limit ; encoding: [0x00,0x00,0x37,0xd6,0x00,0x03,0xba,0x03]
-// NOSICI: :[[@LINE-2]]:28: error: src_private_limit register not available on this GPU
// GFX11: v_div_fmas_f32 v0, v0, v1, src_private_limit ; encoding: [0x00,0x00,0x37,0xd6,0x00,0x03,0xba,0x03]
-// NOVI: :[[@LINE-4]]:28: error: src_private_limit register not available on this GPU
-// NOGFX9: :[[@LINE-5]]:28: error: invalid operand (violates constant bus restrictions)
+// GFX12XX: v_div_fmas_f32 v0, v0, v1, src_private_limit ; encoding: [0x00,0x00,0x37,0xd6,0x00,0x03,0xba,0x03]
+// NOGFX9: :[[@LINE-3]]:28: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-4]]:28: error: src_private_limit register not available on this GPU
+// NOVI: :[[@LINE-5]]:28: error: src_private_limit register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:28: error: src_private_limit register not available on this GPU
// v_div_fmas implicitly reads VCC
v_div_fmas_f32 v0, execz, v0, v1
-// NOSICI: :[[@LINE-1]]:20: error: invalid operand (violates constant bus restrictions)
-// NOGFX89: :[[@LINE-2]]:20: error: invalid operand (violates constant bus restrictions)
-// NOGFX11: :[[@LINE-3]]:20: error: src_execz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:20: error: src_execz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:20: error: src_execz register not available on this GPU
+// NOGFX11: :[[@LINE-1]]:20: error: src_execz register not available on this GPU
+// NOGFX12: :[[@LINE-2]]:20: error: src_execz register not available on this GPU
+// NOGFX1250: :[[@LINE-3]]:20: error: src_execz register not available on this GPU
+// NOGFX89: :[[@LINE-4]]:20: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-5]]:20: error: invalid operand (violates constant bus restrictions)
// NOSICIVI: :[[@LINE-1]]:20: error: invalid operand (violates constant bus restrictions)
// v_div_fmas implicitly reads VCC
v_div_fmas_f32 v0, v0, scc, v1
+// GFX11: v_div_fmas_f32 v0, v0, src_scc, v1 ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xfb,0x05,0x04]
// GFX12XX: v_div_fmas_f32 v0, v0, src_scc, v1 ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xfb,0x05,0x04]
-// NOSICI: :[[@LINE-2]]:24: error: invalid operand (violates constant bus restrictions)
// NOGFX89: :[[@LINE-3]]:24: error: invalid operand (violates constant bus restrictions)
-// GFX11: v_div_fmas_f32 v0, v0, src_scc, v1 ; encoding: [0x00,0x00,0x37,0xd6,0x00,0xfb,0x05,0x04]
+// NOSICI: :[[@LINE-4]]:24: error: invalid operand (violates constant bus restrictions)
// NOSICIVI: :[[@LINE-1]]:24: error: invalid operand (violates constant bus restrictions)
// v_div_fmas implicitly reads VCC
v_div_fmas_f32 v0, v0, v1, vccz
-// NOSICI: :[[@LINE-1]]:28: error: invalid operand (violates constant bus restrictions)
-// NOGFX89: :[[@LINE-2]]:28: error: invalid operand (violates constant bus restrictions)
-// NOGFX11: :[[@LINE-3]]:28: error: src_vccz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:28: error: src_vccz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:28: error: src_vccz register not available on this GPU
+// NOGFX11: :[[@LINE-1]]:28: error: src_vccz register not available on this GPU
+// NOGFX12: :[[@LINE-2]]:28: error: src_vccz register not available on this GPU
+// NOGFX1250: :[[@LINE-3]]:28: error: src_vccz register not available on this GPU
+// NOGFX89: :[[@LINE-4]]:28: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-5]]:28: error: invalid operand (violates constant bus restrictions)
// NOSICIVI: :[[@LINE-1]]:28: error: invalid operand (violates constant bus restrictions)
// v_addc_co_u32 implicitly reads VCC (VOP2)
v_addc_co_u32 v0, vcc, shared_base, v0, vcc
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-// NOGFX9: :[[@LINE-3]]:24: error: invalid operand (violates constant bus restrictions)
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX9: :[[@LINE-4]]:24: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:1: error: instruction not supported on this GPU
// NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
v_madak_f32 v0, shared_base, v0, 0x11213141
-// NOSICI: :[[@LINE-1]]:17: error: src_shared_base register not available on this GPU
-// NOVI: :[[@LINE-2]]:17: error: src_shared_base register not available on this GPU
-// NOGFX9: :[[@LINE-3]]:17: error: invalid operand (violates constant bus restrictions)
-// NOGFX11: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-5]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-6]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX9: :[[@LINE-4]]:17: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-5]]:17: error: src_shared_base register not available on this GPU
+// NOVI: :[[@LINE-6]]:17: error: src_shared_base register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:17: error: src_shared_base register not available on this GPU
v_madak_f32 v0, scc, v0, 0x11213141
-// NOSICI: :[[@LINE-1]]:17: error: invalid operand (violates constant bus restrictions)
-// NOGFX89: :[[@LINE-2]]:17: error: invalid operand (violates constant bus restrictions)
-// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:17: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-5]]:17: error: invalid operand (violates constant bus restrictions)
// NOSICIVI: :[[@LINE-1]]:17: error: invalid operand (violates constant bus restrictions)
v_madak_f32 v0, 0xff32ff, v0, 0x11213141
-// NOSICI: :[[@LINE-1]]:31: error: only one unique literal operand is allowed
-// NOGFX89: :[[@LINE-2]]:31: error: only one unique literal operand is allowed
-// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:31: error: only one unique literal operand is allowed
+// NOSICI: :[[@LINE-5]]:31: error: only one unique literal operand is allowed
// NOSICIVI: :[[@LINE-1]]:31: error: only one unique literal operand is allowed
v_madak_f32 v0, 0xff32ff, v0, 1
-// NOSICI: :[[@LINE-1]]:31: error: only one unique literal operand is allowed
-// NOGFX89: :[[@LINE-2]]:31: error: only one unique literal operand is allowed
-// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:31: error: only one unique literal operand is allowed
+// NOSICI: :[[@LINE-5]]:31: error: only one unique literal operand is allowed
// NOSICIVI: :[[@LINE-1]]:31: error: only one unique literal operand is allowed
v_madmk_f32 v0, 0xff32ff, 0x11213141, v0
-// NOSICI: :[[@LINE-1]]:27: error: only one unique literal operand is allowed
-// NOGFX89: :[[@LINE-2]]:27: error: only one unique literal operand is allowed
-// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:27: error: only one unique literal operand is allowed
+// NOSICI: :[[@LINE-5]]:27: error: only one unique literal operand is allowed
// NOSICIVI: :[[@LINE-1]]:27: error: only one unique literal operand is allowed
v_madmk_f32 v0, 0xff32ff, -1, v0
-// NOSICI: :[[@LINE-1]]:27: error: only one unique literal operand is allowed
-// NOGFX89: :[[@LINE-2]]:27: error: only one unique literal operand is allowed
-// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:27: error: only one unique literal operand is allowed
+// NOSICI: :[[@LINE-5]]:27: error: only one unique literal operand is allowed
// NOSICIVI: :[[@LINE-1]]:27: error: only one unique literal operand is allowed
v_madak_f16 v0, 0xff32, v0, 0x1122
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:29: error: only one unique literal operand is allowed
-// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:29: error: only one unique literal operand is allowed
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
// NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
v_madak_f16 v0, 0xff32, v0, 0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:29: error: only one unique literal operand is allowed
-// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:29: error: only one unique literal operand is allowed
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
// NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
v_madmk_f16 v0, 0xff32, 0x1122, v0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:25: error: only one unique literal operand is allowed
-// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:25: error: only one unique literal operand is allowed
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
// NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
v_madmk_f16 v0, 0xff32, 1, v0
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOGFX89: :[[@LINE-2]]:25: error: only one unique literal operand is allowed
-// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:25: error: only one unique literal operand is allowed
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
// NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
v_cmp_eq_f32 s[0:1], private_base, private_limit
-// NOSICI: :[[@LINE-1]]:22: error: src_private_base register not available on this GPU
-// NOVI: :[[@LINE-2]]:22: error: src_private_base register not available on this GPU
-// NOGFX9: :[[@LINE-3]]:36: error: invalid operand (violates constant bus restrictions)
-// NOGFX11: :[[@LINE-4]]:14: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-5]]:14: error: invalid operand for instruction
-// NOGFX1250: :[[@LINE-6]]:14: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-1]]:14: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-2]]:14: error: invalid operand for instruction
+// NOGFX1250: :[[@LINE-3]]:14: error: invalid operand for instruction
+// NOGFX9: :[[@LINE-4]]:36: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-5]]:22: error: src_private_base register not available on this GPU
+// NOVI: :[[@LINE-6]]:22: error: src_private_base register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:22: error: src_private_base register not available on this GPU
v_cmp_eq_f32 s[0:1], private_base, s0
-// NOSICI: :[[@LINE-1]]:22: error: src_private_base register not available on this GPU
-// NOVI: :[[@LINE-2]]:22: error: src_private_base register not available on this GPU
-// NOGFX9: :[[@LINE-3]]:36: error: invalid operand (violates constant bus restrictions)
-// NOGFX11: :[[@LINE-4]]:14: error: invalid operand for instruction
-// NOGFX12: :[[@LINE-5]]:14: error: invalid operand for instruction
-// NOGFX1250: :[[@LINE-6]]:14: error: invalid operand for instruction
+// NOGFX11: :[[@LINE-1]]:14: error: invalid operand for instruction
+// NOGFX12: :[[@LINE-2]]:14: error: invalid operand for instruction
+// NOGFX1250: :[[@LINE-3]]:14: error: invalid operand for instruction
+// NOGFX9: :[[@LINE-4]]:36: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-5]]:22: error: src_private_base register not available on this GPU
+// NOVI: :[[@LINE-6]]:22: error: src_private_base register not available on this GPU
// NOSICIVI: :[[@LINE-1]]:22: error: src_private_base register not available on this GPU
v_cmp_eq_f32 s[0:1], execz, s0
-// NOSICI: :[[@LINE-1]]:29: error: invalid operand (violates constant bus restrictions)
-// NOGFX89: :[[@LINE-2]]:29: error: invalid operand (violates constant bus restrictions)
-// NOGFX11: :[[@LINE-3]]:22: error: src_execz register not available on this GPU
-// NOGFX12: :[[@LINE-4]]:22: error: src_execz register not available on this GPU
-// NOGFX1250: :[[@LINE-5]]:22: error: src_execz register not available on this GPU
+// NOGFX11: :[[@LINE-1]]:22: error: src_execz register not available on this GPU
+// NOGFX12: :[[@LINE-2]]:22: error: src_execz register not available on this GPU
+// NOGFX1250: :[[@LINE-3]]:22: error: src_execz register not available on this GPU
+// NOGFX89: :[[@LINE-4]]:29: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-5]]:29: error: invalid operand (violates constant bus restrictions)
// NOSICIVI: :[[@LINE-1]]:29: error: invalid operand (violates constant bus restrictions)
v_pk_add_f16 v255, private_base, private_limit
-// GFX12XX: v_pk_add_f16 v255, src_private_base, src_private_limit ; encoding: [0xff,0x40,0x0f,0xcc,0xed,0xdc,0x01,0x18]
-// NOSICI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
// GFX11: v_pk_add_f16 v255, src_private_base, src_private_limit ; encoding: [0xff,0x40,0x0f,0xcc,0xed,0xdc,0x01,0x18]
-// NOVI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX9: :[[@LINE-5]]:34: error: invalid operand (violates constant bus restrictions)
+// GFX12XX: v_pk_add_f16 v255, src_private_base, src_private_limit ; encoding: [0xff,0x40,0x0f,0xcc,0xed,0xdc,0x01,0x18]
+// NOGFX9: :[[@LINE-3]]:34: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
// NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
v_pk_add_f16 v255, vccz, execz
-// NOSICI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
-// NOVI: :[[@LINE-2]]:1: error: instruction not supported on this GPU
-// NOGFX9: :[[@LINE-3]]:26: error: invalid operand (violates constant bus restrictions)
-// NOGFX11: :[[@LINE-4]]:20: error: src_vccz register not available on this GPU
-// NOGFX12: :[[@LINE-5]]:20: error: src_vccz register not available on this GPU
-// NOGFX1250: :[[@LINE-6]]:20: error: src_vccz register not available on this GPU
+// NOGFX11: :[[@LINE-1]]:20: error: src_vccz register not available on this GPU
+// NOGFX12: :[[@LINE-2]]:20: error: src_vccz register not available on this GPU
+// NOGFX1250: :[[@LINE-3]]:20: error: src_vccz register not available on this GPU
+// NOGFX9: :[[@LINE-4]]:26: error: invalid operand (violates constant bus restrictions)
+// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOVI: :[[@LINE-6]]:1: error: instruction not supported on this GPU
// NOSICIVI: :[[@LINE-1]]:1: error: instruction not supported on this GPU
//---------------------------------------------------------------------------//
@@ -1791,36 +1791,36 @@ v_pk_add_f16 v255, vccz, execz
//---------------------------------------------------------------------------//
v_sqrt_f32 v2, lit(123)
-// SICI: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00]
-// GFX89: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x4e,0x04,0x7e,0x7b,0x00,0x00,0x00]
-// GFX12XX: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00]
// GFX11: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00]
+// GFX12XX: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00]
+// GFX89: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x4e,0x04,0x7e,0x7b,0x00,0x00,0x00]
+// SICI: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00]
v_sqrt_f32 v2, abs(lit(123))
-// SICI: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00]
-// GFX89: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x4e,0x04,0x7e,0x7b,0x00,0x00,0x00]
-// GFX12XX: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00]
// GFX11: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00]
+// GFX12XX: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00]
+// GFX89: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x4e,0x04,0x7e,0x7b,0x00,0x00,0x00]
+// SICI: v_sqrt_f32_e32 v2, lit(0x7b) ; encoding: [0xff,0x66,0x04,0x7e,0x7b,0x00,0x00,0x00]
v_sqrt_f32 v2, lit(123.0)
-// SICI: v_sqrt_f32_e32 v2, lit(0x42f60000) ; encoding: [0xff,0x66,0x04,0x7e,0x00,0x00,0xf6,0x42]
-// GFX89: v_sqrt_f32_e32 v2, lit(0x42f60000) ; encoding: [0xff,0x4e,0x04,0x7e,0x00,0x00,0xf6,0x42]
-// GFX12XX: v_sqrt_f32_e32 v2, lit(0x42f60000) ; encoding: [0xff,0x66,0x04,0x7e,0x00,0x00,0xf6,0x42]
// GFX11: v_sqrt_f32_e32 v2, lit(0x42f60000) ; encoding: [0xff,0x66,0x04,0x7e,0x00,0x00,0xf6,0x42]
+// GFX12XX: v_sqrt_f32_e32 v2, lit(0x42f60000) ; encoding: [0xff,0x66,0x04,0x7e,0x00,0x00,0xf6,0x42]
+// GFX89: v_sqrt_f32_e32 v2, lit(0x42f60000) ; encoding: [0xff,0x4e,0x04,0x7e,0x00,0x00,0xf6,0x42]
+// SICI: v_sqrt_f32_e32 v2, lit(0x42f60000) ; encoding: [0xff,0x66,0x04,0x7e,0x00,0x00,0xf6,0x42]
v_sqrt_f64 v[2:3], lit(123.0)
-// SICI: v_sqrt_f64_e32 v[2:3], lit(0x405ec000) ; encoding: [0xff,0x68,0x04,0x7e,0x00,0xc0,0x5e,0x40]
-// GFX89: v_sqrt_f64_e32 v[2:3], lit(0x405ec000) ; encoding: [0xff,0x50,0x04,0x7e,0x00,0xc0,0x5e,0x40]
// GFX11: v_sqrt_f64_e32 v[2:3], lit(0x405ec000) ; encoding: [0xff,0x68,0x04,0x7e,0x00,0xc0,0x5e,0x40]
// GFX12: v_sqrt_f64_e32 v[2:3], lit(0x405ec000) ; encoding: [0xff,0x68,0x04,0x7e,0x00,0xc0,0x5e,0x40]
// GFX1250: v_sqrt_f64_e32 v[2:3], lit(0x405ec000) ; encoding: [0xfe,0x68,0x04,0x7e,0x00,0xc0,0x5e,0x40,0x00,0x00,0x00,0x00]
+// GFX89: v_sqrt_f64_e32 v[2:3], lit(0x405ec000) ; encoding: [0xff,0x50,0x04,0x7e,0x00,0xc0,0x5e,0x40]
+// SICI: v_sqrt_f64_e32 v[2:3], lit(0x405ec000) ; encoding: [0xff,0x68,0x04,0x7e,0x00,0xc0,0x5e,0x40]
v_sqrt_f64 v[2:3], lit(123)
-// SICI: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00]
-// GFX89: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x50,0x04,0x7e,0x7b,0x00,0x00,0x00]
// GFX11: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00]
// GFX12: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00]
// GFX1250: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xfe,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
+// GFX89: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x50,0x04,0x7e,0x7b,0x00,0x00,0x00]
+// SICI: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00]
v_sqrt_f32 v2, lit 123.0
// NOGCN: :[[@LINE-1]]:20: error: expected left paren after lit
@@ -1834,16 +1834,16 @@ v_sqrt_f32 v2, lit(v1)
// Make sure lit() is accepted on operands without modifiers.
v_madak_f32 v4, lit(0x7e8), v8, lit(0x7e8)
-// SICI: v_madak_f32 v4, lit(0x7e8), v8, lit(0x7e8) ; encoding: [0xff,0x10,0x08,0x42,0xe8,0x07,0x00,0x00]
// GFX89: v_madak_f32 v4, lit(0x7e8), v8, lit(0x7e8) ; encoding: [0xff,0x10,0x08,0x30,0xe8,0x07,0x00,0x00]
-// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-4]]:1: error: instruction not supported on this GPU
+// SICI: v_madak_f32 v4, lit(0x7e8), v8, lit(0x7e8) ; encoding: [0xff,0x10,0x08,0x42,0xe8,0x07,0x00,0x00]
v_madak_f32 v4, lit(lit(0x7e8)), v8, lit(0x7e8)
-// NOSICI: :[[@LINE-1]]:24: error: not a valid operand.
-// NOGFX89: :[[@LINE-2]]:24: error: not a valid operand.
-// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU
-// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU
-// NOGFX1250: :[[@LINE-5]]:1: error: instruction not supported on this GPU
+// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU
+// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+// NOGFX1250: :[[@LINE-3]]:1: error: instruction not supported on this GPU
+// NOGFX89: :[[@LINE-4]]:24: error: not a valid operand.
+// NOSICI: :[[@LINE-5]]:24: error: not a valid operand.
// NOSICIVI: :[[@LINE-1]]:24: error: not a valid operand.
diff --git a/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll b/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll
new file mode 100644
index 0000000..ae3c746
--- /dev/null
+++ b/llvm/test/Transforms/DeadStoreElimination/matrix-intrinsics.ll
@@ -0,0 +1,338 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=dse -S %s | FileCheck %s
+
+define void @dead_unstrided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_unstrided_store_non_matrix_load(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L:%.*]] = load double, ptr [[SRC]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+ %l = load double, ptr %src
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+ ret void
+}
+
+define void @live_unstrided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @live_unstrided_store_non_matrix_load(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L_2:%.*]] = load double, ptr [[DST]], align 8
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L_1]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: ret void
+;
+entry:
+ %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+ %l.2 = load double, ptr %dst
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l.1, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+ ret void
+}
+
+define void @dead_strided_store(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_strided_store(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 200, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 100, i1 false, i32 4, i32 2)
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 200, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 100, i1 false, i32 4, i32 2)
+ ret void
+}
+
+define void @live_strided_store(ptr %ptr) {
+; CHECK-LABEL: define void @live_strided_store(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[PTR]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[PTR]], i32 200, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[PTR]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %ptr, i32 100, i1 false, i32 4, i32 2)
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %ptr, i32 200, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %ptr, i32 100, i1 false, i32 4, i32 2)
+ ret void
+}
+
+define void @dead_strided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_strided_store_non_matrix_load(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L:%.*]] = load double, ptr [[SRC]], align 8
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L_1]], ptr [[DST]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT: ret void
+;
+entry:
+ %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 100, i1 false, i32 4, i32 2)
+ %l.2 = load double, ptr %src
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l.1, ptr %dst, i32 100, i1 false, i32 4, i32 2)
+ ret void
+}
+
+define void @live_strided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @live_strided_store_non_matrix_load(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L_2:%.*]] = load double, ptr [[DST]], align 8
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L_1]], ptr [[DST]], i32 100, i1 false, i32 4, i32 2)
+; CHECK-NEXT: ret void
+;
+entry:
+ %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 100, i1 false, i32 4, i32 2)
+ %l.2 = load double, ptr %dst
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l.1, ptr %dst, i32 100, i1 false, i32 4, i32 2)
+ ret void
+}
+
+define void @dead_dynamically_strided_store(ptr noalias %src, ptr noalias %dst, i32 %stride) {
+; CHECK-LABEL: define void @dead_dynamically_strided_store(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[STRIDE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 %stride, i1 false, i32 4, i32 2)
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 %stride, i1 false, i32 4, i32 2)
+ ret void
+}
+
+define void @live_dynamically_strided_store(ptr %ptr, i32 %stride) {
+; CHECK-LABEL: define void @live_dynamically_strided_store(
+; CHECK-SAME: ptr [[PTR:%.*]], i32 [[STRIDE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[PTR]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[PTR]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[PTR]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %ptr, i32 %stride, i1 false, i32 4, i32 2)
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %ptr, i32 %stride, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %ptr, i32 %stride, i1 false, i32 4, i32 2)
+ ret void
+}
+
+define void @dead_dynamically_strided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst, i32 %stride) {
+; CHECK-LABEL: define void @dead_dynamically_strided_store_non_matrix_load(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[STRIDE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L:%.*]] = load double, ptr [[SRC]], align 8
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L_1]], ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT: ret void
+;
+entry:
+ %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 %stride, i1 false, i32 4, i32 2)
+ %l.2 = load double, ptr %src
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l.1, ptr %dst, i32 %stride, i1 false, i32 4, i32 2)
+ ret void
+}
+
+define void @live_dynamically_strided_store_non_matrix_load(ptr noalias %src, ptr noalias %dst, i32 %stride) {
+; CHECK-LABEL: define void @live_dynamically_strided_store_non_matrix_load(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[STRIDE:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[L_1:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 [[STRIDE]], i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L_2:%.*]] = load double, ptr [[DST]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %l.1 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 %stride, i1 false, i32 4, i32 2)
+ %l.2 = load double, ptr %dst
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 %stride, i1 false, i32 4, i32 2)
+ ret void
+}
+
+define void @dead_unstrided_store(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_unstrided_store(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+ ret void
+}
+
+define void @live_unstrided_store(ptr %ptr) {
+; CHECK-LABEL: define void @live_unstrided_store(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[PTR]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[PTR]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[PTR]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr %ptr, i32 4, i1 false, i32 4, i32 2)
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %ptr, i32 4, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %ptr, i32 4, i1 false, i32 4, i32 2)
+ ret void
+}
+
+define void @dead_non_matrix_store(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_non_matrix_store(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[DST_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 6
+; CHECK-NEXT: store double 4.200000e+01, ptr [[DST_OFFSET]], align 8
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: ret void
+;
+entry:
+ %dst.offset = getelementptr inbounds double, ptr %src, i32 6
+ store double 42.0, ptr %dst.offset
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+ ret void
+}
+
+define void @live_non_matrix_store(ptr %ptr) {
+; CHECK-LABEL: define void @live_non_matrix_store(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[PTR_OFFSET:%.*]] = getelementptr inbounds double, ptr [[PTR]], i32 6
+; CHECK-NEXT: store double 4.200000e+01, ptr [[PTR_OFFSET]], align 8
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[PTR]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[PTR]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: ret void
+;
+entry:
+ %ptr.offset = getelementptr inbounds double, ptr %ptr, i32 6
+ store double 42.0, ptr %ptr.offset
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %ptr, i32 4, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %ptr, i32 4, i1 false, i32 4, i32 2)
+ ret void
+}
+
+define void @dead_matrix_store_non_matrix_overwrite_unstrided(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_matrix_store_non_matrix_overwrite_unstrided(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: store <8 x double> zeroinitializer, ptr [[DST]], align 64
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.matrix.column.major.store(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+ store <8 x double> zeroinitializer, ptr %dst
+ ret void
+}
+
+define void @dead_matrix_store_non_matrix_overwrite_strided(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_matrix_store_non_matrix_overwrite_strided(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 8, i1 false, i32 4, i32 2)
+; CHECK-NEXT: store <16 x double> zeroinitializer, ptr [[DST]], align 128
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.matrix.column.major.store(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 8, i1 false, i32 4, i32 2)
+ store <16 x double> zeroinitializer, ptr %dst
+ ret void
+}
+
+define void @live_matrix_store_non_matrix_overwrite_unstrided(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @live_matrix_store_non_matrix_overwrite_unstrided(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: store <4 x double> zeroinitializer, ptr [[DST]], align 32
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.matrix.column.major.store(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+ store <4 x double> zeroinitializer, ptr %dst
+ ret void
+}
+
+define void @live_matrix_store_non_matrix_overwrite_strided(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @live_matrix_store_non_matrix_overwrite_strided(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> zeroinitializer, ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 8, i1 false, i32 4, i32 2)
+; CHECK-NEXT: store <8 x double> zeroinitializer, ptr [[DST]], align 64
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @llvm.matrix.column.major.store(<8 x double> zeroinitializer, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 8, i1 false, i32 4, i32 2)
+ store <8 x double> zeroinitializer, ptr %dst
+ ret void
+}
+
+define void @dead_matrix_store_dimension_change(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @dead_matrix_store_dimension_change(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 8, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v9f64.i32(<9 x double> zeroinitializer, ptr [[DST]], i32 3, i1 false, i32 3, i32 3)
+; CHECK-NEXT: ret void
+;
+entry:
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 8, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v9f64.i32(<9 x double> zeroinitializer, ptr %dst, i32 3, i1 false, i32 3, i32 3)
+ ret void
+}
+
+define void @live_matrix_store_dimension_change(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @live_matrix_store_dimension_change(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 8, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v9f64.i32(<9 x double> zeroinitializer, ptr [[DST]], i32 3, i1 false, i32 3, i32 3)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[DST]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: ret void
+;
+entry:
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 8, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v9f64.i32(<9 x double> zeroinitializer, ptr %dst, i32 3, i1 false, i32 3, i32 3)
+ call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> %l, ptr %dst, i32 4, i1 false, i32 4, i32 2)
+ ret void
+}
+
+declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr, i32, i1, i32, i32)
+declare <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr, i32, i1, i32, i32)
+declare void @llvm.matrix.column.major.store.v8f64.i32(<8 x double>, ptr, i32, i1, i32, i32)
diff --git a/llvm/test/Transforms/GVN/matrix-intrinsics.ll b/llvm/test/Transforms/GVN/matrix-intrinsics.ll
new file mode 100644
index 0000000..78dbfe1
--- /dev/null
+++ b/llvm/test/Transforms/GVN/matrix-intrinsics.ll
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=gvn -S %s | FileCheck %s
+
+define void @redundant_unstrided_load(ptr %src) {
+; CHECK-LABEL: define void @redundant_unstrided_load(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 8
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L_2:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @use(<8 x double> [[L]])
+; CHECK-NEXT: call void @use(<8 x double> [[L_2]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %src.offset = getelementptr inbounds double, ptr %src, i32 8
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 4, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64(<8 x double> %l, ptr %src, i32 4, i1 false, i32 4, i32 2)
+ %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 4, i1 false, i32 4, i32 2)
+ call void @use(<8 x double> %l)
+ call void @use(<8 x double> %l.2)
+ ret void
+}
+
+define void @redundant_unstrided_load_non_matrix_store(ptr %src) {
+; CHECK-LABEL: define void @redundant_unstrided_load_non_matrix_store(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 1
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: store double 4.200000e+01, ptr [[SRC]], align 8
+; CHECK-NEXT: [[L_2:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @use(<8 x double> [[L]])
+; CHECK-NEXT: call void @use(<8 x double> [[L_2]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %src.offset = getelementptr inbounds double, ptr %src, i32 1
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 4, i1 false, i32 4, i32 2)
+ store double 42.0, ptr %src
+ %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 4, i1 false, i32 4, i32 2)
+ call void @use(<8 x double> %l)
+ call void @use(<8 x double> %l.2)
+ ret void
+}
+
+define void @redundant_strided_load(ptr %src) {
+; CHECK-LABEL: define void @redundant_strided_load(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 16
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 8, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @llvm.matrix.column.major.store.v8f64.i32(<8 x double> [[L]], ptr [[SRC]], i32 8, i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L_2:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 8, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @use(<8 x double> [[L]])
+; CHECK-NEXT: call void @use(<8 x double> [[L_2]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %src.offset = getelementptr inbounds double, ptr %src, i32 16
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+ call void @llvm.matrix.column.major.store.v8f64(<8 x double> %l, ptr %src, i32 8, i1 false, i32 4, i32 2)
+ %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+ call void @use(<8 x double> %l)
+ call void @use(<8 x double> %l.2)
+ ret void
+
+}
+
+define void @redundant_strided_load_non_matrix_store(ptr %src) {
+; CHECK-LABEL: define void @redundant_strided_load_non_matrix_store(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds double, ptr [[SRC]], i32 16
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 8, i1 false, i32 4, i32 2)
+; CHECK-NEXT: store double 4.200000e+01, ptr [[SRC]], align 8
+; CHECK-NEXT: [[L_2:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC_OFFSET]], i32 8, i1 false, i32 4, i32 2)
+; CHECK-NEXT: call void @use(<8 x double> [[L]])
+; CHECK-NEXT: call void @use(<8 x double> [[L_2]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %src.offset = getelementptr inbounds double, ptr %src, i32 16
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+ store double 42.0, ptr %src
+ %l.2 = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src.offset, i32 8, i1 false, i32 4, i32 2)
+ call void @use(<8 x double> %l)
+ call void @use(<8 x double> %l.2)
+ ret void
+}
+
+define void @repeat_load_dimension_change_project(ptr %src) {
+; CHECK-LABEL: define void @repeat_load_dimension_change_project(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L_2:%.*]] = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr [[SRC]], i32 3, i1 false, i32 3, i32 3)
+; CHECK-NEXT: [[L_3:%.*]] = shufflevector <9 x double> [[L_2]], <9 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: call void @use(<8 x double> [[L]])
+; CHECK-NEXT: call void @use(<8 x double> [[L_3]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+ %l.2 = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr %src, i32 3, i1 false, i32 3, i32 3)
+ %l.3 = shufflevector <9 x double> %l.2, <9 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ call void @use(<8 x double> %l)
+ call void @use(<8 x double> %l.3)
+ ret void
+}
+
+define void @repeat_load_dimension_change_shuffle(ptr %src) {
+; CHECK-LABEL: define void @repeat_load_dimension_change_shuffle(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[L:%.*]] = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr [[SRC]], i32 4, i1 false, i32 4, i32 2)
+; CHECK-NEXT: [[L_2:%.*]] = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr [[SRC]], i32 3, i1 false, i32 3, i32 3)
+; CHECK-NEXT: [[L_3:%.*]] = shufflevector <9 x double> [[L_2]], <9 x double> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: call void @use(<8 x double> [[L]])
+; CHECK-NEXT: call void @use(<8 x double> [[L_3]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %l = call <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr %src, i32 4, i1 false, i32 4, i32 2)
+ %l.2 = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr %src, i32 3, i1 false, i32 3, i32 3)
+ %l.3 = shufflevector <9 x double> %l.2, <9 x double> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ call void @use(<8 x double> %l)
+ call void @use(<8 x double> %l.3)
+ ret void
+}
+
+declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr, i32, i1, i32, i32)
+declare <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr, i32, i1, i32, i32)
+declare void @llvm.matrix.column.major.store.v8f64.i32(<8 x double>, ptr, i32, i1, i32, i32)
+declare void @use(<8 x double>)
diff --git a/llvm/test/Transforms/InstCombine/select_with_identical_phi.ll b/llvm/test/Transforms/InstCombine/select_with_identical_phi.ll
deleted file mode 100644
index 7816781..0000000
--- a/llvm/test/Transforms/InstCombine/select_with_identical_phi.ll
+++ /dev/null
@@ -1,243 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -passes=instcombine | FileCheck %s
-@A = extern_weak global float, align 4
-
-; %same.as.v1 is a select with two phis %v1 and %phi.to.remove as the true
-; and false values, while %v1 and %phi.to.remove are actually the same.
-; Fold the selection instruction %same.as.v1 to %v1.
-define void @select_with_identical_phi(ptr %m, ptr %n, i32 %count) {
-; CHECK-LABEL: @select_with_identical_phi(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4
-; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]]
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]]
-; CHECK-NEXT: [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]]
-; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]]
-; CHECK-NEXT: [[V1_1]] = select i1 [[CMP2]], float [[SUB]], float [[V1]]
-; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I]], 1
-; CHECK-NEXT: [[Q_NEXT]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 4
-; CHECK-NEXT: [[C_NEXT]] = getelementptr inbounds nuw i8, ptr [[C]], i64 4
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]]
-; CHECK: exit:
-; CHECK-NEXT: store float [[V1_1]], ptr @A, align 4
-; CHECK-NEXT: ret void
-;
-entry:
- br label %for.body
-
-for.body: ; preds = %entry, %for.body
- %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ]
- %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ]
- %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ]
- %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ]
- %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ]
- %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ]
- %q.load = load float, ptr %q
- %c.load = load float, ptr %c
- %sub = fsub float %q.load, %c.load
- %cmp1 = fcmp olt float %sub, %v0
- %v0.1 = select i1 %cmp1, float %sub, float %v0
- %same.as.v1 = select i1 %cmp1, float %v1, float %phi.to.remove
- %cmp2 = fcmp ogt float %sub, %same.as.v1
- %v1.1 = select i1 %cmp2, float %sub, float %v1
- %phi.to.remove.next = select i1 %cmp2, float %sub, float %same.as.v1
- %inc.i = add nuw nsw i32 %i, 1
- %q.next = getelementptr inbounds i8, ptr %q, i64 4
- %c.next = getelementptr inbounds i8, ptr %c, i64 4
- %exitcond = icmp eq i32 %inc.i, %count
- br i1 %exitcond, label %exit, label %for.body
-
-exit:
- %vl.1.lcssa = phi float [ %v1.1, %for.body ]
- store float %vl.1.lcssa, ptr @A
- ret void
-}
-
-; The difference from select_with_identical_phi() is that the true and false values in
-; %phi.to.remove.next and %v1.1 are swapped.
-; Check that %same.as.v1 can be folded.
-define void @select_with_identical_phi_2(ptr %m, ptr %n, i32 %count) {
-; CHECK-LABEL: @select_with_identical_phi_2(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4
-; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]]
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]]
-; CHECK-NEXT: [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]]
-; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]]
-; CHECK-NEXT: [[V1_1]] = select i1 [[CMP2]], float [[V1]], float [[SUB]]
-; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I]], 1
-; CHECK-NEXT: [[Q_NEXT]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 4
-; CHECK-NEXT: [[C_NEXT]] = getelementptr inbounds nuw i8, ptr [[C]], i64 4
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]]
-; CHECK: exit:
-; CHECK-NEXT: store float [[V1_1]], ptr @A, align 4
-; CHECK-NEXT: ret void
-;
-entry:
- br label %for.body
-
-for.body: ; preds = %entry, %for.body
- %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ]
- %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ]
- %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ]
- %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ]
- %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ]
- %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ]
- %q.load = load float, ptr %q
- %c.load = load float, ptr %c
- %sub = fsub float %q.load, %c.load
- %cmp1 = fcmp olt float %sub, %v0
- %v0.1 = select i1 %cmp1, float %sub, float %v0
- %same.as.v1 = select i1 %cmp1, float %v1, float %phi.to.remove
- %cmp2 = fcmp ogt float %sub, %same.as.v1
- %v1.1 = select i1 %cmp2, float %v1, float %sub
- %phi.to.remove.next = select i1 %cmp2, float %same.as.v1, float %sub
- %inc.i = add nuw nsw i32 %i, 1
- %q.next = getelementptr inbounds i8, ptr %q, i64 4
- %c.next = getelementptr inbounds i8, ptr %c, i64 4
- %exitcond = icmp eq i32 %inc.i, %count
- br i1 %exitcond, label %exit, label %for.body
-
-exit:
- %vl.1.lcssa = phi float [ %v1.1, %for.body ]
- store float %vl.1.lcssa, ptr @A
- ret void
-}
-
-; The difference from select_with_identical_phi() is that the true and false values in
-; same.as.v1 are swapped.
-; Check that %same.as.v1 can be folded.
-define void @select_with_identical_phi_3(ptr %m, ptr %n, i32 %count) {
-; CHECK-LABEL: @select_with_identical_phi_3(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4
-; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]]
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]]
-; CHECK-NEXT: [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]]
-; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]]
-; CHECK-NEXT: [[V1_1]] = select i1 [[CMP2]], float [[SUB]], float [[V1]]
-; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I]], 1
-; CHECK-NEXT: [[Q_NEXT]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 4
-; CHECK-NEXT: [[C_NEXT]] = getelementptr inbounds nuw i8, ptr [[C]], i64 4
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]]
-; CHECK: exit:
-; CHECK-NEXT: store float [[V1_1]], ptr @A, align 4
-; CHECK-NEXT: ret void
-;
-entry:
- br label %for.body
-
-for.body: ; preds = %entry, %for.body
- %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ]
- %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ]
- %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ]
- %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ]
- %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ]
- %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ]
- %q.load = load float, ptr %q
- %c.load = load float, ptr %c
- %sub = fsub float %q.load, %c.load
- %cmp1 = fcmp olt float %sub, %v0
- %v0.1 = select i1 %cmp1, float %sub, float %v0
- %same.as.v1 = select i1 %cmp1, float %phi.to.remove, float %v1
- %cmp2 = fcmp ogt float %sub, %same.as.v1
- %v1.1 = select i1 %cmp2, float %sub, float %v1
- %phi.to.remove.next = select i1 %cmp2, float %sub, float %same.as.v1
- %inc.i = add nuw nsw i32 %i, 1
- %q.next = getelementptr inbounds i8, ptr %q, i64 4
- %c.next = getelementptr inbounds i8, ptr %c, i64 4
- %exitcond = icmp eq i32 %inc.i, %count
- br i1 %exitcond, label %exit, label %for.body
-
-exit:
- %vl.1.lcssa = phi float [ %v1.1, %for.body ]
- store float %vl.1.lcssa, ptr @A
- ret void
-}
-
-; The difference from select_with_identical_phi() is that the true and false values in
-; %same.as.v1, %phi.to.remove.next and %v1.1 are swapped.
-; Check that %same.as.v1 can be folded.
-define void @select_with_identical_phi_4(ptr %m, ptr %n, i32 %count) {
-; CHECK-LABEL: @select_with_identical_phi_4(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4
-; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]]
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]]
-; CHECK-NEXT: [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]]
-; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]]
-; CHECK-NEXT: [[V1_1]] = select i1 [[CMP2]], float [[V1]], float [[SUB]]
-; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I]], 1
-; CHECK-NEXT: [[Q_NEXT]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 4
-; CHECK-NEXT: [[C_NEXT]] = getelementptr inbounds nuw i8, ptr [[C]], i64 4
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]]
-; CHECK: exit:
-; CHECK-NEXT: store float [[V1_1]], ptr @A, align 4
-; CHECK-NEXT: ret void
-;
-entry:
- br label %for.body
-
-for.body: ; preds = %entry, %for.body
- %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ]
- %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ]
- %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ]
- %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ]
- %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ]
- %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ]
- %q.load = load float, ptr %q
- %c.load = load float, ptr %c
- %sub = fsub float %q.load, %c.load
- %cmp1 = fcmp olt float %sub, %v0
- %v0.1 = select i1 %cmp1, float %sub, float %v0
- %same.as.v1 = select i1 %cmp1, float %phi.to.remove, float %v1
- %cmp2 = fcmp ogt float %sub, %same.as.v1
- %v1.1 = select i1 %cmp2, float %v1, float %sub
- %phi.to.remove.next = select i1 %cmp2, float %same.as.v1, float %sub
- %inc.i = add nuw nsw i32 %i, 1
- %q.next = getelementptr inbounds i8, ptr %q, i64 4
- %c.next = getelementptr inbounds i8, ptr %c, i64 4
- %exitcond = icmp eq i32 %inc.i, %count
- br i1 %exitcond, label %exit, label %for.body
-
-exit:
- %vl.1.lcssa = phi float [ %v1.1, %for.body ]
- store float %vl.1.lcssa, ptr @A
- ret void
-}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
index 11cc971..fb7890a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
@@ -417,21 +417,17 @@ for.end: ; preds = %for.body, %entry
; Note: This test was added to ensure we always check the legality of reductions (end emit a warning if necessary) before checking for memory dependencies
; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
-; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
+; CHECK-REMARK: Ignoring user-specified interleave count due to possibly unsafe dependencies in the loop.
+; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 1)
define i32 @memory_dependence(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @memory_dependence
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <4 x i32>
; CHECK: %[[LOAD2:.*]] = load <4 x i32>
-; CHECK: %[[LOAD3:.*]] = load <4 x i32>
-; CHECK: %[[LOAD4:.*]] = load <4 x i32>
-; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD3]], %[[LOAD1]]
-; CHECK: %[[ADD2:.*]] = add nsw <4 x i32> %[[LOAD4]], %[[LOAD2]]
-; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]]
-; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]]
+; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD2]], %[[LOAD1]]
+; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD2]]
; CHECK: middle.block:
-; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
-; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
+; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[MUL1]])
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll
index 829acbbf..305a692 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll
@@ -210,3 +210,175 @@ loop:
exit:
ret void
}
+
+define void @test_masked_interleave_group(i32 %N, ptr %mask, ptr %src, ptr %dst) {
+; IC1-LABEL: define void @test_masked_interleave_group(
+; IC1-SAME: i32 [[N:%.*]], ptr [[MASK:%.*]], ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] {
+; IC1-NEXT: [[ENTRY:.*:]]
+; IC1-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; IC1-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
+; IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; IC1-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
+; IC1-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP3]], i64 8)
+; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[UMAX]]
+; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; IC1: [[VECTOR_MEMCHECK]]:
+; IC1-NEXT: [[TMP4:%.*]] = zext i32 [[N]] to i64
+; IC1-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+; IC1-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 16
+; IC1-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
+; IC1-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP4]], 1
+; IC1-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[TMP7]]
+; IC1-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP6]]
+; IC1-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
+; IC1-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[MASK]], [[SCEVGEP]]
+; IC1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; IC1-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]]
+; IC1-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
+; IC1-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
+; IC1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
+; IC1-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; IC1: [[VECTOR_PH]]:
+; IC1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; IC1-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4
+; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP9]]
+; IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; IC1-NEXT: [[TMP10:%.*]] = trunc i64 [[N_VEC]] to i32
+; IC1-NEXT: [[TMP11:%.*]] = mul i64 [[N_VEC]], 16
+; IC1-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]]
+; IC1-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 16
+; IC1-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]]
+; IC1-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[N_VEC]]
+; IC1-NEXT: br label %[[VECTOR_BODY:.*]]
+; IC1: [[VECTOR_BODY]]:
+; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC1-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16
+; IC1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]]
+; IC1-NEXT: [[OFFSET_IDX6:%.*]] = mul i64 [[INDEX]], 16
+; IC1-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX6]]
+; IC1-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[INDEX]]
+; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[NEXT_GEP8]], align 1, !alias.scope [[META6:![0-9]+]]
+; IC1-NEXT: [[TMP16:%.*]] = icmp eq <vscale x 4 x i8> [[WIDE_LOAD]], zeroinitializer
+; IC1-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]])
+; IC1-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 16 x float> @llvm.masked.load.nxv16f32.p0(ptr align 4 [[NEXT_GEP7]], <vscale x 16 x i1> [[INTERLEAVED_MASK]], <vscale x 16 x float> poison), !alias.scope [[META9:![0-9]+]]
+; IC1-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave4.nxv16f32(<vscale x 16 x float> [[WIDE_MASKED_VEC]])
+; IC1-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0
+; IC1-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1
+; IC1-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 2
+; IC1-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 3
+; IC1-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 16 x float> @llvm.vector.interleave4.nxv16f32(<vscale x 4 x float> [[TMP17]], <vscale x 4 x float> [[TMP18]], <vscale x 4 x float> [[TMP19]], <vscale x 4 x float> [[TMP20]])
+; IC1-NEXT: [[INTERLEAVED_MASK9:%.*]] = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]])
+; IC1-NEXT: call void @llvm.masked.store.nxv16f32.p0(<vscale x 16 x float> [[INTERLEAVED_VEC]], ptr align 4 [[NEXT_GEP]], <vscale x 16 x i1> [[INTERLEAVED_MASK9]]), !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]]
+; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; IC1-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IC1-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; IC1: [[MIDDLE_BLOCK]]:
+; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; IC1-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; IC1: [[SCALAR_PH]]:
+;
+; CHECK-LABEL: define void @test_masked_interleave_group(
+; CHECK-SAME: i32 [[N:%.*]], ptr [[MASK:%.*]], ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
+; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP3]], i64 8)
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[UMAX]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK: [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP5]], 16
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP4]], 1
+; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[TMP7]]
+; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP6]]
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[MASK]], [[SCEVGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]]
+; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
+; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
+; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP9]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[N_VEC]], 16
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 16
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[N_VEC]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[OFFSET_IDX6:%.*]] = mul i64 [[INDEX]], 16
+; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX6]]
+; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[NEXT_GEP8]], align 1, !alias.scope [[META6:![0-9]+]]
+; CHECK-NEXT: [[TMP16:%.*]] = icmp eq <vscale x 4 x i8> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]])
+; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 16 x float> @llvm.masked.load.nxv16f32.p0(ptr align 4 [[NEXT_GEP7]], <vscale x 16 x i1> [[INTERLEAVED_MASK]], <vscale x 16 x float> poison), !alias.scope [[META9:![0-9]+]]
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave4.nxv16f32(<vscale x 16 x float> [[WIDE_MASKED_VEC]])
+; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 0
+; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 1
+; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 2
+; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[STRIDED_VEC]], 3
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 16 x float> @llvm.vector.interleave4.nxv16f32(<vscale x 4 x float> [[TMP17]], <vscale x 4 x float> [[TMP18]], <vscale x 4 x float> [[TMP19]], <vscale x 4 x float> [[TMP20]])
+; CHECK-NEXT: [[INTERLEAVED_MASK9:%.*]] = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i1> [[TMP16]])
+; CHECK-NEXT: call void @llvm.masked.store.nxv16f32.p0(<vscale x 16 x float> [[INTERLEAVED_VEC]], ptr align 4 [[NEXT_GEP]], <vscale x 16 x i1> [[INTERLEAVED_MASK9]]), !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %dst.iv = phi ptr [ %dst, %entry ], [ %dst.iv.next, %loop.latch ]
+ %src.iv = phi ptr [ %src, %entry ], [ %src.iv.next, %loop.latch ]
+ %mask.iv = phi ptr [ %mask, %entry ], [ %mask.iv.next, %loop.latch ]
+ %mask.iv.next = getelementptr i8, ptr %mask.iv, i64 1
+ %mask.val = load i8, ptr %mask.iv, align 1
+ %should.copy = icmp eq i8 %mask.val, 0
+ br i1 %should.copy, label %then, label %loop.latch
+
+then:
+ %elem0 = load float, ptr %src.iv, align 4
+ store float %elem0, ptr %dst.iv, align 4
+ %src.1.ptr = getelementptr i8, ptr %src.iv, i64 4
+ %s1 = load float, ptr %src.1.ptr, align 4
+ %dst.1.ptr = getelementptr i8, ptr %dst.iv, i64 4
+ store float %s1, ptr %dst.1.ptr, align 4
+ %src.2.ptr = getelementptr i8, ptr %src.iv, i64 8
+ %s2 = load float, ptr %src.2.ptr, align 4
+ %dst.2.ptr = getelementptr i8, ptr %dst.iv, i64 8
+ store float %s2, ptr %dst.2.ptr, align 4
+ %src.3.ptr = getelementptr i8, ptr %src.iv, i64 12
+ %s3 = load float, ptr %src.3.ptr, align 4
+ %dst.3.ptr = getelementptr i8, ptr %dst.iv, i64 12
+ store float %s3, ptr %dst.3.ptr, align 4
+ br label %loop.latch
+
+loop.latch:
+ %iv.next = add i32 %iv, 1
+ %src.iv.next = getelementptr i8, ptr %src.iv, i64 16
+ %dst.iv.next = getelementptr i8, ptr %dst.iv, i64 16
+ %ec = icmp eq i32 %iv, %N
+ br i1 %ec, label %exit, label %loop.header
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
index c8d20dc..e42e2c7 100644
--- a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
@@ -7,6 +7,7 @@ target triple = "wasm32-unknown-wasi"
%struct.TwoInts = type { i32, i32 }
%struct.ThreeInts = type { i32, i32, i32 }
%struct.FourInts = type { i32, i32, i32, i32 }
+%struct.TwoShorts = type { i16, i16 }
%struct.ThreeShorts = type { i16, i16, i16 }
%struct.FourShorts = type { i16, i16, i16, i16 }
%struct.TwoBytes = type { i8, i8 }
@@ -14,6 +15,8 @@ target triple = "wasm32-unknown-wasi"
%struct.FourBytes = type { i8, i8, i8, i8 }
%struct.FiveBytes = type { i8, i8, i8, i8, i8 }
%struct.EightBytes = type { i8, i8, i8, i8, i8, i8, i8, i8 }
+%struct.TwoFloats = type { float, float }
+%struct.FourFloats = type { float, float, float, float }
; CHECK-LABEL: two_ints_same_op
; CHECK: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 2 at %10
@@ -1350,3 +1353,1000 @@ define hidden void @scale_uv_row_down2_linear(ptr nocapture noundef readonly %0,
34: ; preds = %6, %4
ret void
}
+
+; CHECK-LABEL: two_floats_same_op
+; CHECK: LV: Scalar loop costs: 14
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 14.
+; CHECK: LV: Vector loop of width 2 costs: 19.
+; CHECK: LV: Vector loop of width 4 costs: 15.
+; CHECK: LV: Selecting VF: 1.
+define hidden void @two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp21.not = icmp eq i32 %N, 0
+ br i1 %cmp21.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022
+ %1 = load float, ptr %arrayidx1, align 4
+ %mul = fmul float %0, %1
+ %arrayidx3 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.022
+ store float %mul, ptr %arrayidx3, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %mul8 = fmul float %2, %3
+ %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4
+ store float %mul8, ptr %y10, align 4
+ %inc = add nuw i32 %i.022, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_floats_vary_op
+; CHECK: LV: Scalar loop costs: 14
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 14.
+; CHECK: LV: Vector loop of width 2 costs: 19.
+; CHECK: LV: Vector loop of width 4 costs: 15.
+; CHECK: LV: Selecting VF: 1.
+define hidden void @two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp20.not = icmp eq i32 %N, 0
+ br i1 %cmp20.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.021 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.021
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.021
+ %1 = load float, ptr %arrayidx1, align 4
+ %add = fadd float %0, %1
+ %arrayidx3 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.021
+ store float %add, ptr %arrayidx3, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %sub = fsub float %2, %3
+ %y9 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4
+ store float %sub, ptr %y9, align 4
+ %inc = add nuw i32 %i.021, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_bytes_two_floats_same_op
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 18
+; CHECK: LV: Vector loop of width 2 costs: 23
+; CHECK: LV: Vector loop of width 4 costs: 13
+; CHECK: LV: Selecting VF: 4.
+define hidden void @two_bytes_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp24.not = icmp eq i32 %N, 0
+ br i1 %cmp24.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.025 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoBytes, ptr %a, i32 %i.025
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = sitofp i8 %0 to float
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoBytes, ptr %b, i32 %i.025
+ %1 = load i8, ptr %arrayidx1, align 1
+ %conv3 = sitofp i8 %1 to float
+ %mul = fmul float %conv, %conv3
+ %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.025
+ store float %mul, ptr %arrayidx4, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1
+ %2 = load i8, ptr %y, align 1
+ %conv7 = sitofp i8 %2 to float
+ %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1
+ %3 = load i8, ptr %y9, align 1
+ %conv10 = sitofp i8 %3 to float
+ %mul11 = fmul float %conv7, %conv10
+ %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4
+ store float %mul11, ptr %y13, align 4
+ %inc = add nuw i32 %i.025, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_bytes_two_floats_vary_op
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 18
+; CHECK: LV: Vector loop of width 2 costs: 23
+; CHECK: LV: Vector loop of width 4 costs: 13
+; CHECK: LV: Selecting VF: 4.
+define hidden void @two_bytes_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp23.not = icmp eq i32 %N, 0
+ br i1 %cmp23.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.024 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoBytes, ptr %a, i32 %i.024
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = sitofp i8 %0 to float
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoBytes, ptr %b, i32 %i.024
+ %1 = load i8, ptr %arrayidx1, align 1
+ %conv3 = sitofp i8 %1 to float
+ %add = fadd float %conv, %conv3
+ %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.024
+ store float %add, ptr %arrayidx4, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1
+ %2 = load i8, ptr %y, align 1
+ %conv7 = sitofp i8 %2 to float
+ %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1
+ %3 = load i8, ptr %y9, align 1
+ %conv10 = sitofp i8 %3 to float
+ %sub = fsub float %conv7, %conv10
+ %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4
+ store float %sub, ptr %y12, align 4
+ %inc = add nuw i32 %i.024, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_floats_two_bytes_same_op
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 16
+; CHECK: LV: Vector loop of width 2 costs: 21
+; CHECK: LV: Vector loop of width 4 costs: 14.
+; CHECK: LV: Selecting VF: 4.
+define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp22.not = icmp eq i32 %N, 0
+ br i1 %cmp22.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.023
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.023
+ %1 = load float, ptr %arrayidx1, align 4
+ %mul = fmul float %0, %1
+ %conv = fptosi float %mul to i8
+ %arrayidx3 = getelementptr inbounds nuw %struct.TwoBytes, ptr %res, i32 %i.023
+ store i8 %conv, ptr %arrayidx3, align 1
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %mul8 = fmul float %2, %3
+ %conv9 = fptosi float %mul8 to i8
+ %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1
+ store i8 %conv9, ptr %y11, align 1
+ %inc = add nuw i32 %i.023, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_floats_two_bytes_vary_op
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 16
+; CHECK: LV: Vector loop of width 2 costs: 21
+; CHECK: LV: Vector loop of width 4 costs: 14.
+; CHECK: LV: Selecting VF: 4.
+define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp21.not = icmp eq i32 %N, 0
+ br i1 %cmp21.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022
+ %1 = load float, ptr %arrayidx1, align 4
+ %add = fadd float %0, %1
+ %conv = fptosi float %add to i8
+ %arrayidx3 = getelementptr inbounds nuw %struct.TwoBytes, ptr %res, i32 %i.022
+ store i8 %conv, ptr %arrayidx3, align 1
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %sub = fsub float %2, %3
+ %conv8 = fptosi float %sub to i8
+ %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1
+ store i8 %conv8, ptr %y10, align 1
+ %inc = add nuw i32 %i.022, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_shorts_two_floats_same_op
+; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 18
+; CHECK: LV: Vector loop of width 2 costs: 22
+; CHECK: LV: Vector loop of width 4 costs: 11.
+; CHECK: LV: Selecting VF: 4.
+define hidden void @two_shorts_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp24.not = icmp eq i32 %N, 0
+ br i1 %cmp24.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.025 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoShorts, ptr %a, i32 %i.025
+ %0 = load i16, ptr %arrayidx, align 2
+ %conv = sitofp i16 %0 to float
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoShorts, ptr %b, i32 %i.025
+ %1 = load i16, ptr %arrayidx1, align 2
+ %conv3 = sitofp i16 %1 to float
+ %mul = fmul float %conv, %conv3
+ %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.025
+ store float %mul, ptr %arrayidx4, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2
+ %2 = load i16, ptr %y, align 2
+ %conv7 = sitofp i16 %2 to float
+ %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2
+ %3 = load i16, ptr %y9, align 2
+ %conv10 = sitofp i16 %3 to float
+ %mul11 = fmul float %conv7, %conv10
+ %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4
+ store float %mul11, ptr %y13, align 4
+ %inc = add nuw i32 %i.025, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_shorts_two_floats_vary_op
+; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 18
+; CHECK: LV: Vector loop of width 2 costs: 22
+; CHECK: LV: Vector loop of width 4 costs: 11.
+; CHECK: LV: Selecting VF: 4.
+define hidden void @two_shorts_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp23.not = icmp eq i32 %N, 0
+ br i1 %cmp23.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.024 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoShorts, ptr %a, i32 %i.024
+ %0 = load i16, ptr %arrayidx, align 2
+ %conv = sitofp i16 %0 to float
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoShorts, ptr %b, i32 %i.024
+ %1 = load i16, ptr %arrayidx1, align 2
+ %conv3 = sitofp i16 %1 to float
+ %add = fadd float %conv, %conv3
+ %arrayidx4 = getelementptr inbounds nuw %struct.TwoFloats, ptr %res, i32 %i.024
+ store float %add, ptr %arrayidx4, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2
+ %2 = load i16, ptr %y, align 2
+ %conv7 = sitofp i16 %2 to float
+ %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2
+ %3 = load i16, ptr %y9, align 2
+ %conv10 = sitofp i16 %3 to float
+ %sub = fsub float %conv7, %conv10
+ %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4
+ store float %sub, ptr %y12, align 4
+ %inc = add nuw i32 %i.024, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_floats_two_shorts_same_op
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 16
+; CHECK: LV: Vector loop of width 2 costs: 20
+; CHECK: LV: Vector loop of width 4 costs: 13.
+; CHECK: LV: Selecting VF: 4.
+define hidden void @two_floats_two_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp22.not = icmp eq i32 %N, 0
+ br i1 %cmp22.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.023 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.023
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.023
+ %1 = load float, ptr %arrayidx1, align 4
+ %mul = fmul float %0, %1
+ %conv = fptosi float %mul to i16
+ %arrayidx3 = getelementptr inbounds nuw %struct.TwoShorts, ptr %res, i32 %i.023
+ store i16 %conv, ptr %arrayidx3, align 2
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %mul8 = fmul float %2, %3
+ %conv9 = fptosi float %mul8 to i16
+ %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2
+ store i16 %conv9, ptr %y11, align 2
+ %inc = add nuw i32 %i.023, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: two_floats_two_shorts_vary_op
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 11 for VF 2: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
+; CHECK: LV: Scalar loop costs: 16
+; CHECK: LV: Vector loop of width 2 costs: 20
+; CHECK: LV: Vector loop of width 4 costs: 13.
+; CHECK: LV: Selecting VF: 4.
+define hidden void @two_floats_two_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp21.not = icmp eq i32 %N, 0
+ br i1 %cmp21.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.022 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.TwoFloats, ptr %a, i32 %i.022
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.TwoFloats, ptr %b, i32 %i.022
+ %1 = load float, ptr %arrayidx1, align 4
+ %add = fadd float %0, %1
+ %conv = fptosi float %add to i16
+ %arrayidx3 = getelementptr inbounds nuw %struct.TwoShorts, ptr %res, i32 %i.022
+ store i16 %conv, ptr %arrayidx3, align 2
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %sub = fsub float %2, %3
+ %conv8 = fptosi float %sub to i16
+ %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2
+ store i16 %conv8, ptr %y10, align 2
+ %inc = add nuw i32 %i.022, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_floats_same_op
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: LV: Scalar loop costs: 24
+; CHECK: LV: Vector loop of width 2 costs: 33
+; CHECK: LV: Vector loop of width 4 costs: 30
+; CHECK: LV: Selecting VF: 4
+define hidden void @four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp45.not = icmp eq i32 %N, 0
+ br i1 %cmp45.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046
+ %1 = load float, ptr %arrayidx1, align 4
+ %mul = fmul float %0, %1
+ %arrayidx3 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.046
+ store float %mul, ptr %arrayidx3, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %mul8 = fmul float %2, %3
+ %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4
+ store float %mul8, ptr %y10, align 4
+ %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8
+ %4 = load float, ptr %z, align 4
+ %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8
+ %5 = load float, ptr %z13, align 4
+ %mul14 = fmul float %4, %5
+ %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 8
+ store float %mul14, ptr %z16, align 4
+ %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12
+ %6 = load float, ptr %w, align 4
+ %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12
+ %7 = load float, ptr %w19, align 4
+ %mul20 = fmul float %6, %7
+ %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 12
+ store float %mul20, ptr %w22, align 4
+ %inc = add nuw i32 %i.046, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_floats_vary_op
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: LV: Scalar loop costs: 24
+; CHECK: LV: Vector loop of width 2 costs: 33
+; CHECK: LV: Vector loop of width 4 costs: 30
+; CHECK: LV: Selecting VF: 1
+define hidden void @four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp42.not = icmp eq i32 %N, 0
+ br i1 %cmp42.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.043 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.043
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.043
+ %1 = load float, ptr %arrayidx1, align 4
+ %add = fadd float %0, %1
+ %arrayidx3 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.043
+ store float %add, ptr %arrayidx3, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %sub = fsub float %2, %3
+ %y9 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4
+ store float %sub, ptr %y9, align 4
+ %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8
+ %4 = load float, ptr %z, align 4
+ %z12 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8
+ %5 = load float, ptr %z12, align 4
+ %mul = fmul float %4, %5
+ %z14 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 8
+ store float %mul, ptr %z14, align 4
+ %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12
+ %6 = load float, ptr %w, align 4
+ %w17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12
+ %7 = load float, ptr %w17, align 4
+ %div = fdiv float %6, %7
+ %w19 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 12
+ store float %div, ptr %w19, align 4
+ %inc = add nuw i32 %i.043, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_bytes_four_floats_same_op
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: LV: Scalar loop costs: 32
+; CHECK: LV: Vector loop of width 2 costs: 43
+; CHECK: LV: Vector loop of width 4 costs: 23
+; CHECK: LV: Selecting VF: 4
+define hidden void @four_bytes_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp52.not = icmp eq i32 %N, 0
+ br i1 %cmp52.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.053 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.FourBytes, ptr %a, i32 %i.053
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = sitofp i8 %0 to float
+ %arrayidx1 = getelementptr inbounds nuw %struct.FourBytes, ptr %b, i32 %i.053
+ %1 = load i8, ptr %arrayidx1, align 1
+ %conv3 = sitofp i8 %1 to float
+ %mul = fmul float %conv, %conv3
+ %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.053
+ store float %mul, ptr %arrayidx4, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1
+ %2 = load i8, ptr %y, align 1
+ %conv7 = sitofp i8 %2 to float
+ %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1
+ %3 = load i8, ptr %y9, align 1
+ %conv10 = sitofp i8 %3 to float
+ %mul11 = fmul float %conv7, %conv10
+ %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4
+ store float %mul11, ptr %y13, align 4
+ %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2
+ %4 = load i8, ptr %z, align 1
+ %conv15 = sitofp i8 %4 to float
+ %z17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2
+ %5 = load i8, ptr %z17, align 1
+ %conv18 = sitofp i8 %5 to float
+ %mul19 = fmul float %conv15, %conv18
+ %z21 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8
+ store float %mul19, ptr %z21, align 4
+ %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 3
+ %6 = load i8, ptr %w, align 1
+ %conv23 = sitofp i8 %6 to float
+ %w25 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 3
+ %7 = load i8, ptr %w25, align 1
+ %conv26 = sitofp i8 %7 to float
+ %mul27 = fmul float %conv23, %conv26
+ %w29 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12
+ store float %mul27, ptr %w29, align 4
+ %inc = add nuw i32 %i.053, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_bytes_four_floats_vary_op
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: LV: Scalar loop costs: 32
+; CHECK: LV: Vector loop of width 2 costs: 43
+; CHECK: LV: Vector loop of width 4 costs: 23
+; CHECK: LV: Selecting VF: 4
+define hidden void @four_bytes_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp49.not = icmp eq i32 %N, 0
+ br i1 %cmp49.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.050 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.FourBytes, ptr %a, i32 %i.050
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = sitofp i8 %0 to float
+ %arrayidx1 = getelementptr inbounds nuw %struct.FourBytes, ptr %b, i32 %i.050
+ %1 = load i8, ptr %arrayidx1, align 1
+ %conv3 = sitofp i8 %1 to float
+ %mul = fmul float %conv, %conv3
+ %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.050
+ store float %mul, ptr %arrayidx4, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 1
+ %2 = load i8, ptr %y, align 1
+ %conv7 = sitofp i8 %2 to float
+ %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 1
+ %3 = load i8, ptr %y9, align 1
+ %conv10 = sitofp i8 %3 to float
+ %add = fadd float %conv7, %conv10
+ %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4
+ store float %add, ptr %y12, align 4
+ %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2
+ %4 = load i8, ptr %z, align 1
+ %conv14 = sitofp i8 %4 to float
+ %z16 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2
+ %5 = load i8, ptr %z16, align 1
+ %conv17 = sitofp i8 %5 to float
+ %div = fdiv float %conv14, %conv17
+ %z19 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8
+ store float %div, ptr %z19, align 4
+ %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 3
+ %6 = load i8, ptr %w, align 1
+ %conv21 = sitofp i8 %6 to float
+ %w23 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 3
+ %7 = load i8, ptr %w23, align 1
+ %conv24 = sitofp i8 %7 to float
+ %sub = fsub float %conv21, %conv24
+ %w26 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12
+ store float %sub, ptr %w26, align 4
+ %inc = add nuw i32 %i.050, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_floats_four_bytes_same_op
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: LV: Scalar loop costs: 28
+; CHECK: LV: Vector loop of width 2 costs: 38
+; CHECK: LV: Vector loop of width 4 costs: 26
+; CHECK: LV: Selecting VF: 4
+define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp48.not = icmp eq i32 %N, 0
+ br i1 %cmp48.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.049 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.049
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.049
+ %1 = load float, ptr %arrayidx1, align 4
+ %mul = fmul float %0, %1
+ %conv = fptosi float %mul to i8
+ %arrayidx3 = getelementptr inbounds nuw %struct.FourBytes, ptr %res, i32 %i.049
+ store i8 %conv, ptr %arrayidx3, align 1
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %mul8 = fmul float %2, %3
+ %conv9 = fptosi float %mul8 to i8
+ %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1
+ store i8 %conv9, ptr %y11, align 1
+ %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8
+ %4 = load float, ptr %z, align 4
+ %z14 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8
+ %5 = load float, ptr %z14, align 4
+ %mul15 = fmul float %4, %5
+ %conv16 = fptosi float %mul15 to i8
+ %z18 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2
+ store i8 %conv16, ptr %z18, align 1
+ %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12
+ %6 = load float, ptr %w, align 4
+ %w21 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12
+ %7 = load float, ptr %w21, align 4
+ %mul22 = fmul float %6, %7
+ %conv23 = fptosi float %mul22 to i8
+ %w25 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 3
+ store i8 %conv23, ptr %w25, align 1
+ %inc = add nuw i32 %i.049, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_floats_four_bytes_vary_op
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: LV: Scalar loop costs: 28
+; CHECK: LV: Vector loop of width 2 costs: 38
+; CHECK: LV: Vector loop of width 4 costs: 26
+; CHECK: LV: Selecting VF: 4
+define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp45.not = icmp eq i32 %N, 0
+ br i1 %cmp45.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046
+ %1 = load float, ptr %arrayidx1, align 4
+ %mul = fmul float %0, %1
+ %conv = fptosi float %mul to i8
+ %arrayidx3 = getelementptr inbounds nuw %struct.FourBytes, ptr %res, i32 %i.046
+ store i8 %conv, ptr %arrayidx3, align 1
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %add = fadd float %2, %3
+ %conv8 = fptosi float %add to i8
+ %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 1
+ store i8 %conv8, ptr %y10, align 1
+ %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8
+ %4 = load float, ptr %z, align 4
+ %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8
+ %5 = load float, ptr %z13, align 4
+ %div = fdiv float %4, %5
+ %conv14 = fptosi float %div to i8
+ %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2
+ store i8 %conv14, ptr %z16, align 1
+ %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12
+ %6 = load float, ptr %w, align 4
+ %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12
+ %7 = load float, ptr %w19, align 4
+ %sub = fsub float %6, %7
+ %conv20 = fptosi float %sub to i8
+ %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 3
+ store i8 %conv20, ptr %w22, align 1
+ %inc = add nuw i32 %i.046, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_shorts_four_floats_same_op
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: LV: Scalar loop costs: 32
+; CHECK: LV: Vector loop of width 2 costs: 37
+; CHECK: LV: Vector loop of width 4 costs: 23
+; CHECK: LV: Selecting VF: 4
+define hidden void @four_shorts_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp52.not = icmp eq i32 %N, 0
+ br i1 %cmp52.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.053 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.FourShorts, ptr %a, i32 %i.053
+ %0 = load i16, ptr %arrayidx, align 2
+ %conv = sitofp i16 %0 to float
+ %arrayidx1 = getelementptr inbounds nuw %struct.FourShorts, ptr %b, i32 %i.053
+ %1 = load i16, ptr %arrayidx1, align 2
+ %conv3 = sitofp i16 %1 to float
+ %mul = fmul float %conv, %conv3
+ %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.053
+ store float %mul, ptr %arrayidx4, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2
+ %2 = load i16, ptr %y, align 2
+ %conv7 = sitofp i16 %2 to float
+ %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2
+ %3 = load i16, ptr %y9, align 2
+ %conv10 = sitofp i16 %3 to float
+ %mul11 = fmul float %conv7, %conv10
+ %y13 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4
+ store float %mul11, ptr %y13, align 4
+ %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %4 = load i16, ptr %z, align 2
+ %conv15 = sitofp i16 %4 to float
+ %z17 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %5 = load i16, ptr %z17, align 2
+ %conv18 = sitofp i16 %5 to float
+ %mul19 = fmul float %conv15, %conv18
+ %z21 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8
+ store float %mul19, ptr %z21, align 4
+ %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 6
+ %6 = load i16, ptr %w, align 2
+ %conv23 = sitofp i16 %6 to float
+ %w25 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 6
+ %7 = load i16, ptr %w25, align 2
+ %conv26 = sitofp i16 %7 to float
+ %mul27 = fmul float %conv23, %conv26
+ %w29 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12
+ store float %mul27, ptr %w29, align 4
+ %inc = add nuw i32 %i.053, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_shorts_four_floats_vary_op
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: LV: Scalar loop costs: 32
+; CHECK: LV: Vector loop of width 2 costs: 37
+; CHECK: LV: Vector loop of width 4 costs: 23
+; CHECK: LV: Selecting VF: 4
+define hidden void @four_shorts_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp49.not = icmp eq i32 %N, 0
+ br i1 %cmp49.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.050 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.FourShorts, ptr %a, i32 %i.050
+ %0 = load i16, ptr %arrayidx, align 2
+ %conv = sitofp i16 %0 to float
+ %arrayidx1 = getelementptr inbounds nuw %struct.FourShorts, ptr %b, i32 %i.050
+ %1 = load i16, ptr %arrayidx1, align 2
+ %conv3 = sitofp i16 %1 to float
+ %mul = fmul float %conv, %conv3
+ %arrayidx4 = getelementptr inbounds nuw %struct.FourFloats, ptr %res, i32 %i.050
+ store float %mul, ptr %arrayidx4, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 2
+ %2 = load i16, ptr %y, align 2
+ %conv7 = sitofp i16 %2 to float
+ %y9 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 2
+ %3 = load i16, ptr %y9, align 2
+ %conv10 = sitofp i16 %3 to float
+ %add = fadd float %conv7, %conv10
+ %y12 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 4
+ store float %add, ptr %y12, align 4
+ %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %4 = load i16, ptr %z, align 2
+ %conv14 = sitofp i16 %4 to float
+ %z16 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %5 = load i16, ptr %z16, align 2
+ %conv17 = sitofp i16 %5 to float
+ %div = fdiv float %conv14, %conv17
+ %z19 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 8
+ store float %div, ptr %z19, align 4
+ %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 6
+ %6 = load i16, ptr %w, align 2
+ %conv21 = sitofp i16 %6 to float
+ %w23 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 6
+ %7 = load i16, ptr %w23, align 2
+ %conv24 = sitofp i16 %7 to float
+ %sub = fsub float %conv21, %conv24
+ %w26 = getelementptr inbounds nuw i8, ptr %arrayidx4, i32 12
+ store float %sub, ptr %w26, align 4
+ %inc = add nuw i32 %i.050, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_floats_four_shorts_same_op
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: LV: Scalar loop costs: 28
+; CHECK: LV: Vector loop of width 2 costs: 35
+; CHECK: LV: Vector loop of width 4 costs: 26
+; CHECK: LV: Selecting VF: 4
+define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp48.not = icmp eq i32 %N, 0
+ br i1 %cmp48.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.049 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.049
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.049
+ %1 = load float, ptr %arrayidx1, align 4
+ %mul = fmul float %0, %1
+ %conv = fptosi float %mul to i16
+ %arrayidx3 = getelementptr inbounds nuw %struct.FourShorts, ptr %res, i32 %i.049
+ store i16 %conv, ptr %arrayidx3, align 2
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %mul8 = fmul float %2, %3
+ %conv9 = fptosi float %mul8 to i16
+ %y11 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2
+ store i16 %conv9, ptr %y11, align 2
+ %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8
+ %4 = load float, ptr %z, align 4
+ %z14 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8
+ %5 = load float, ptr %z14, align 4
+ %mul15 = fmul float %4, %5
+ %conv16 = fptosi float %mul15 to i16
+ %z18 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4
+ store i16 %conv16, ptr %z18, align 2
+ %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12
+ %6 = load float, ptr %w, align 4
+ %w21 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12
+ %7 = load float, ptr %w21, align 4
+ %mul22 = fmul float %6, %7
+ %conv23 = fptosi float %mul22 to i16
+ %w25 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 6
+ store i16 %conv23, ptr %w25, align 2
+ %inc = add nuw i32 %i.049, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: four_floats_four_shorts_vary_op
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 2: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
+; CHECK: LV: Scalar loop costs: 28
+; CHECK: LV: Vector loop of width 2 costs: 35
+; CHECK: LV: Vector loop of width 4 costs: 26
+; CHECK: LV: Selecting VF: 4
+define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
+entry:
+ %cmp45.not = icmp eq i32 %N, 0
+ br i1 %cmp45.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %i.046 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds nuw %struct.FourFloats, ptr %a, i32 %i.046
+ %0 = load float, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds nuw %struct.FourFloats, ptr %b, i32 %i.046
+ %1 = load float, ptr %arrayidx1, align 4
+ %mul = fmul float %0, %1
+ %conv = fptosi float %mul to i16
+ %arrayidx3 = getelementptr inbounds nuw %struct.FourShorts, ptr %res, i32 %i.046
+ store i16 %conv, ptr %arrayidx3, align 2
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i32 4
+ %2 = load float, ptr %y, align 4
+ %y7 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 4
+ %3 = load float, ptr %y7, align 4
+ %add = fadd float %2, %3
+ %conv8 = fptosi float %add to i16
+ %y10 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 2
+ store i16 %conv8, ptr %y10, align 2
+ %z = getelementptr inbounds nuw i8, ptr %arrayidx, i32 8
+ %4 = load float, ptr %z, align 4
+ %z13 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 8
+ %5 = load float, ptr %z13, align 4
+ %div = fdiv float %4, %5
+ %conv14 = fptosi float %div to i16
+ %z16 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 4
+ store i16 %conv14, ptr %z16, align 2
+ %w = getelementptr inbounds nuw i8, ptr %arrayidx, i32 12
+ %6 = load float, ptr %w, align 4
+ %w19 = getelementptr inbounds nuw i8, ptr %arrayidx1, i32 12
+ %7 = load float, ptr %w19, align 4
+ %sub = fsub float %6, %7
+ %conv20 = fptosi float %sub to i16
+ %w22 = getelementptr inbounds nuw i8, ptr %arrayidx3, i32 6
+ store i16 %conv20, ptr %w22, align 2
+ %inc = add nuw i32 %i.046, 1
+ %exitcond.not = icmp eq i32 %inc, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll b/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll
new file mode 100644
index 0000000..01934b1
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/unsafe-ic-hint-remark.ll
@@ -0,0 +1,30 @@
+; RUN: opt -passes=loop-vectorize -pass-remarks-analysis=loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+; Make sure the unsafe user specified interleave count is ignored.
+
+; CHECK: remark: <unknown>:0:0: Ignoring user-specified interleave count due to possibly unsafe dependencies in the loop.
+; CHECK-LABEL: @loop_distance_4
+define void @loop_distance_4(ptr %a, ptr %b) {
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 4, %entry ], [ %iv.next, %loop ]
+ %0 = getelementptr i32, ptr %b, i64 %iv
+ %arrayidx = getelementptr i8, ptr %0, i64 -16
+ %1 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds nuw i32, ptr %a, i64 %iv
+ %2 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %2, %1
+ store i32 %add, ptr %0, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 64
+ br i1 %exitcond.not, label %for.end, label %loop, !llvm.loop !1
+
+for.end:
+ ret void
+}
+
+!1 = !{!1, !2, !3}
+!2 = !{!"llvm.loop.interleave.count", i32 4}
+!3 = !{!"llvm.loop.vectorize.width", i32 4}
diff --git a/llvm/test/Transforms/SCCP/conditions-ranges.ll b/llvm/test/Transforms/SCCP/conditions-ranges.ll
index a3cf23b..f793814 100644
--- a/llvm/test/Transforms/SCCP/conditions-ranges.ll
+++ b/llvm/test/Transforms/SCCP/conditions-ranges.ll
@@ -1547,3 +1547,28 @@ bb2:
call void @use(i1 %c4)
ret void
}
+
+define i1 @and_predicate_dominating_phi(i32 %x) {
+; CHECK-LABEL: @and_predicate_dominating_phi(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[XGE1:%.*]] = icmp uge i32 [[X:%.*]], 1
+; CHECK-NEXT: [[XLT2:%.*]] = icmp ult i32 [[X]], 2
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[XGE1]], [[XLT2]]
+; CHECK-NEXT: br i1 [[AND]], label [[PHI:%.*]], label [[NOPE:%.*]]
+; CHECK: nope:
+; CHECK-NEXT: br label [[PHI]]
+; CHECK: phi:
+; CHECK-NEXT: ret i1 true
+;
+entry:
+ %xge1 = icmp uge i32 %x, 1
+ %xlt2 = icmp ult i32 %x, 2
+ %and = and i1 %xge1, %xlt2
+ br i1 %and, label %phi, label %nope
+nope:
+ br label %phi
+phi:
+ %res = phi i32 [ %x, %entry ], [ 1, %nope ]
+ %ret = icmp uge i32 %res, 1
+ ret i1 %ret
+}
diff --git a/llvm/test/Transforms/Util/PredicateInfo/testandor.ll b/llvm/test/Transforms/Util/PredicateInfo/testandor.ll
index 2e96a92..cc1dc4e 100644
--- a/llvm/test/Transforms/Util/PredicateInfo/testandor.ll
+++ b/llvm/test/Transforms/Util/PredicateInfo/testandor.ll
@@ -994,3 +994,30 @@ define void @test_assume_deep_and_tree(i1 %a1) {
call void @foo(i1 %a15)
ret void
}
+
+define i32 @test_and_with_phinode(i32 %x) {
+; CHECK-LABEL: @test_and_with_phinode(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[XGE1:%.*]] = icmp uge i32 [[X:%.*]], 1
+; CHECK-NEXT: [[XLT2:%.*]] = icmp ult i32 [[X]], 2
+; CHECK-NEXT: [[AND:%.*]] = and i1 [[XGE1]], [[XLT2]]
+; CHECK: [[X_0_1:%.*]] = bitcast i32 [[X]] to i32
+; CHECK: [[X_0_2:%.*]] = bitcast i32 [[X_0_1]] to i32
+; CHECK-NEXT: br i1 [[AND]], label [[PHI:%.*]], label [[NOPE:%.*]]
+; CHECK: nope:
+; CHECK-NEXT: br label [[PHI]]
+; CHECK: phi:
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[X_0_2]], [[ENTRY:%.*]] ], [ 1, [[NOPE]] ]
+; CHECK-NEXT: ret i32 [[RES]]
+;
+entry:
+ %xge1 = icmp uge i32 %x, 1
+ %xlt2 = icmp ult i32 %x, 2
+ %and = and i1 %xge1, %xlt2
+ br i1 %and, label %phi, label %nope
+nope:
+ br label %phi
+phi:
+ %res = phi i32 [ %x, %entry ], [ 1, %nope ]
+ ret i32 %res
+}
diff --git a/llvm/test/Transforms/WholeProgramDevirt/speculative-devirt-single-impl.ll b/llvm/test/Transforms/WholeProgramDevirt/speculative-devirt-single-impl.ll
new file mode 100644
index 0000000..10566ae
--- /dev/null
+++ b/llvm/test/Transforms/WholeProgramDevirt/speculative-devirt-single-impl.ll
@@ -0,0 +1,132 @@
+; -stats requires asserts
+; REQUIRES: asserts
+
+; Check that we can still devirtualize outside LTO mode when speculative devirtualization is enabled.
+; Check that we skip devirtualization for empty functions in speculative devirtualization mode
+
+; RUN: opt -S -passes=wholeprogramdevirt -devirtualize-speculatively \
+; RUN: -pass-remarks=wholeprogramdevirt -stats %s 2>&1 | FileCheck %s
+
+target datalayout = "e-p:64:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK: remark: devirt-single.cc:30:32: single-impl: devirtualized a call to vf
+; CHECK: remark: devirt-single.cc:41:32: single-impl: devirtualized a call to vf
+; CHECK: remark: devirt-single.cc:51:32: single-impl: devirtualized a call to vf
+; CHECK: remark: devirt-single.cc:13:0: devirtualized vf
+; CHECK-NOT: devirtualized
+
+@vt1 = constant [1 x ptr] [ptr @vf], !type !8
+@vt2 = constant [1 x ptr] [ptr @vf_empty], !type !12
+
+define i1 @vf(ptr %this) #0 !dbg !7 {
+ ret i1 true
+}
+
+; This should NOT be devirtualized because during non-lto empty functions
+; are skipped.
+define void @vf_empty(ptr %this) !dbg !11 {
+ ret void
+}
+
+; CHECK: define void @call
+define void @call(ptr %obj) #1 !dbg !5 {
+ %vtable = load ptr, ptr %obj
+ %p = call i1 @llvm.public.type.test(ptr %vtable, metadata !"typeid")
+ call void @llvm.assume(i1 %p)
+ %fptr = load ptr, ptr %vtable
+ ; CHECK: if.true.direct_targ:
+ ; CHECK: call i1 @vf(
+ ; CHECK: if.false.orig_indirect:
+ ; CHECK: call i1 %fptr(
+ call i1 %fptr(ptr %obj), !dbg !6
+ ret void
+}
+
+
+; CHECK: define void @call1
+define void @call1(ptr %obj) #1 !dbg !9 {
+ %vtable = load ptr, ptr %obj
+ %p = call i1 @llvm.type.test(ptr %vtable, metadata !"typeid1")
+ call void @llvm.assume(i1 %p)
+ %fptr = load ptr, ptr %vtable, align 8
+ ; CHECK: call i1 %fptr
+ %1 = call i1 %fptr(ptr %obj), !dbg !10
+ ret void
+}
+declare ptr @llvm.load.relative.i32(ptr, i32)
+
+@vt3 = private unnamed_addr constant [1 x i32] [
+ i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @vf to i64), i64 ptrtoint (ptr @vt3 to i64)) to i32)
+], align 4, !type !15
+
+; CHECK: define void @call2
+define void @call2(ptr %obj) #1 !dbg !13 {
+ %vtable = load ptr, ptr %obj
+ %p = call i1 @llvm.type.test(ptr %vtable, metadata !"typeid2")
+ call void @llvm.assume(i1 %p)
+ %fptr = call ptr @llvm.load.relative.i32(ptr %vtable, i32 0)
+ ; CHECK: if.true.direct_targ:
+ ; CHECK: call i1 @vf(
+ ; CHECK: if.false.orig_indirect:
+ ; CHECK: call i1 %fptr(
+ call i1 %fptr(ptr %obj), !dbg !14
+ ret void
+}
+
+@_ZTV1A.local = private unnamed_addr constant { [3 x i32] } { [3 x i32] [
+ i32 0, ; offset to top
+ i32 0, ; rtti
+ i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @vf to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [3 x i32] }, ptr @_ZTV1A.local, i32 0, i32 0, i32 2) to i64)) to i32) ; vf_emptyunc offset
+] }, align 4, !type !18
+
+; CHECK: define void @call3
+define void @call3(ptr %obj) #1 !dbg !16 {
+ %vtable = load ptr, ptr %obj
+ %p = call i1 @llvm.type.test(ptr %vtable, metadata !"typeid3")
+ call void @llvm.assume(i1 %p)
+ %fptr = call ptr @llvm.load.relative.i32(ptr %vtable, i32 8)
+ ; CHECK: if.true.direct_targ:
+ ; CHECK: call i1 @vf(
+ ; CHECK: if.false.orig_indirect:
+ ; CHECK: call i1 %fptr(
+ call i1 %fptr(ptr %obj), !dbg !17
+ ret void
+}
+
+
+declare i1 @llvm.type.test(ptr, metadata)
+declare i1 @llvm.public.type.test(ptr, metadata)
+declare void @llvm.assume(i1)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 4.0.0 (trunk 278098)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "devirt-single.cc", directory: ".")
+!2 = !{i32 2, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{!"clang version 4.0.0 (trunk 278098)"}
+!5 = distinct !DISubprogram(name: "call", linkageName: "_Z4callPv", scope: !1, file: !1, line: 29, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
+!6 = !DILocation(line: 30, column: 32, scope: !5)
+!7 = distinct !DISubprogram(name: "vf", linkageName: "_ZN3vt12vfEv", scope: !1, file: !1, line: 13, isLocal: false, isDefinition: true, scopeLine: 13, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
+!8 = !{i32 0, !"typeid"}
+
+!9 = distinct !DISubprogram(name: "call1", linkageName: "_Z5call1Pv", scope: !1, file: !1, line: 31, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
+!10 = !DILocation(line: 35, column: 32, scope: !9)
+!11 = distinct !DISubprogram(name: "vf_empty", linkageName: "_ZN3vt18vf_emptyEv", scope: !1, file: !1, line: 23, isLocal: false, isDefinition: true, scopeLine: 23, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
+!12 = !{i32 0, !"typeid1"}
+
+!13 = distinct !DISubprogram(name: "call2", linkageName: "_Z5call2Pv", scope: !1, file: !1, line: 40, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
+!14 = !DILocation(line: 41, column: 32, scope: !13)
+!15 = !{i32 0, !"typeid2"}
+
+!16 = distinct !DISubprogram(name: "call3", linkageName: "_Z5call3Pv", scope: !1, file: !1, line: 50, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false, unit: !0)
+!17 = !DILocation(line: 51, column: 32, scope: !16)
+!18 = !{i32 0, !"typeid3"}
+
+
+
+; CHECK: 1 wholeprogramdevirt - Number of whole program devirtualization targets
+; CHECK: 3 wholeprogramdevirt - Number of single implementation devirtualizations
diff --git a/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll b/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll
index d8f5c91..8327e1c 100644
--- a/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll
+++ b/llvm/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll
@@ -11,6 +11,9 @@
; Check wildcard
; RUN: opt -S -passes=wholeprogramdevirt -whole-program-visibility -pass-remarks=wholeprogramdevirt -wholeprogramdevirt-skip=vf?i1 %s 2>&1 | FileCheck %s --check-prefix=SKIP
+; Check that no stats are reported in speculative devirtualization mode as the virtual const prop is disabled.
+; RUN: opt -S -passes=wholeprogramdevirt -devirtualize-speculatively -stats %s 2>&1 | FileCheck %s --check-prefix=CHECK-SPECULATIVE-WPD
+
target datalayout = "e-p:64:64"
target triple = "x86_64-unknown-linux-gnu"
@@ -225,3 +228,7 @@ declare ptr @llvm.load.relative.i32(ptr, i32)
; CHECK: 2 wholeprogramdevirt - Number of unique return value optimizations
; CHECK: 2 wholeprogramdevirt - Number of virtual constant propagations
; CHECK: 2 wholeprogramdevirt - Number of 1 bit virtual constant propagations
+
+; CHECK-SPECULATIVE-WPD-NOT: 0 wholeprogramdevirt - Number of unique return value optimizations
+; CHECK-SPECULATIVE-WPD-NOT: 0 wholeprogramdevirt - Number of virtual constant propagations
+; CHECK-SPECULATIVE-WPD-NOT: 0 wholeprogramdevirt - Number of 1 bit virtual constant propagations
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx1.s
index 1ffe533..d1df304 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx1.s
@@ -1403,8 +1403,8 @@ vzeroupper
# CHECK-NEXT: 1 8 0.50 * vpblendvb %xmm3, (%rax), %xmm1, %xmm2
# CHECK-NEXT: 1 1 0.25 vpblendw $11, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 1 8 0.50 * vpblendw $11, (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 4 4 2.00 vpclmulqdq $11, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 4 11 2.00 * vpclmulqdq $11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT: 4 4 1.50 vpclmulqdq $11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 4 11 1.50 * vpclmulqdq $11, (%rax), %xmm1, %xmm2
# CHECK-NEXT: 1 1 0.25 vpcmpeqb %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 1 8 0.50 * vpcmpeqb (%rax), %xmm1, %xmm2
# CHECK-NEXT: 1 1 0.25 vpcmpeqd %xmm0, %xmm1, %xmm2
@@ -1415,8 +1415,8 @@ vzeroupper
# CHECK-NEXT: 1 8 0.50 * vpcmpeqw (%rax), %xmm1, %xmm2
# CHECK-NEXT: 8 6 3.00 vpcmpestri $1, %xmm0, %xmm2
# CHECK-NEXT: 12 13 3.00 * vpcmpestri $1, (%rax), %xmm2
-# CHECK-NEXT: 7 6 3.00 vpcmpestrm $1, %xmm0, %xmm2
-# CHECK-NEXT: 12 13 3.00 * vpcmpestrm $1, (%rax), %xmm2
+# CHECK-NEXT: 7 7 3.00 vpcmpestrm $1, %xmm0, %xmm2
+# CHECK-NEXT: 12 14 3.00 * vpcmpestrm $1, (%rax), %xmm2
# CHECK-NEXT: 1 1 0.25 vpcmpgtb %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 1 8 0.50 * vpcmpgtb (%rax), %xmm1, %xmm2
# CHECK-NEXT: 1 1 0.25 vpcmpgtd %xmm0, %xmm1, %xmm2
@@ -1427,8 +1427,8 @@ vzeroupper
# CHECK-NEXT: 1 8 0.50 * vpcmpgtw (%rax), %xmm1, %xmm2
# CHECK-NEXT: 4 2 2.00 vpcmpistri $1, %xmm0, %xmm2
# CHECK-NEXT: 4 9 2.00 * vpcmpistri $1, (%rax), %xmm2
-# CHECK-NEXT: 3 6 2.00 vpcmpistrm $1, %xmm0, %xmm2
-# CHECK-NEXT: 4 13 2.00 * vpcmpistrm $1, (%rax), %xmm2
+# CHECK-NEXT: 3 7 2.00 vpcmpistrm $1, %xmm0, %xmm2
+# CHECK-NEXT: 4 14 2.00 * vpcmpistrm $1, (%rax), %xmm2
# CHECK-NEXT: 1 3 1.00 vperm2f128 $1, %ymm0, %ymm1, %ymm2
# CHECK-NEXT: 1 10 1.00 * vperm2f128 $1, (%rax), %ymm1, %ymm2
# CHECK-NEXT: 1 1 0.50 vpermilpd $1, %xmm0, %xmm2
@@ -1749,7 +1749,7 @@ vzeroupper
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
-# CHECK-NEXT: 1.33 1.33 1.33 16.50 16.50 16.50 16.50 - 205.25 393.58 268.08 158.08 208.50 208.50 65.00 119.67 119.67 119.67 107.00 107.00 107.00 19.00 19.00
+# CHECK-NEXT: 1.33 1.33 1.33 16.50 16.50 16.50 16.50 - 204.25 392.58 268.08 158.08 208.50 208.50 65.00 119.67 119.67 119.67 107.00 107.00 107.00 19.00 19.00
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
@@ -2126,8 +2126,8 @@ vzeroupper
# CHECK-NEXT: - - - - - - - - 0.50 - - 0.50 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpblendvb %xmm3, (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - vpblendw $11, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpblendw $11, (%rax), %xmm1, %xmm2
-# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - - - - - - - - vpclmulqdq $11, %xmm0, %xmm1, %xmm2
-# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %xmm1, %xmm2
+# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - - - - - - - - - - - - vpclmulqdq $11, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - vpcmpeqb %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpcmpeqb (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - vpcmpeqd %xmm0, %xmm1, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx2.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx2.s
index 6dc5bac..6c8fac4 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx2.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx2.s
@@ -560,14 +560,14 @@ vpxor (%rax), %ymm1, %ymm2
# CHECK-NEXT: 1 8 0.50 * vpcmpgtw (%rax), %ymm1, %ymm2
# CHECK-NEXT: 1 3 1.00 vperm2i128 $1, %ymm0, %ymm1, %ymm2
# CHECK-NEXT: 1 8 1.00 * vperm2i128 $1, (%rax), %ymm1, %ymm2
-# CHECK-NEXT: 2 5 1.00 vpermd %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 2 12 2.00 * vpermd (%rax), %ymm1, %ymm2
-# CHECK-NEXT: 2 6 1.00 vpermpd $1, %ymm0, %ymm2
-# CHECK-NEXT: 3 13 2.00 * vpermpd $1, (%rax), %ymm2
-# CHECK-NEXT: 2 7 1.00 vpermps %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 3 14 2.00 * vpermps (%rax), %ymm1, %ymm2
-# CHECK-NEXT: 2 6 1.00 vpermq $1, %ymm0, %ymm2
-# CHECK-NEXT: 2 12 2.00 * vpermq $1, (%rax), %ymm2
+# CHECK-NEXT: 1 4 1.00 vpermd %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1 11 1.00 * vpermd (%rax), %ymm1, %ymm2
+# CHECK-NEXT: 1 4 1.00 vpermpd $1, %ymm0, %ymm2
+# CHECK-NEXT: 1 11 1.00 * vpermpd $1, (%rax), %ymm2
+# CHECK-NEXT: 1 4 1.00 vpermps %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1 11 1.00 * vpermps (%rax), %ymm1, %ymm2
+# CHECK-NEXT: 1 4 1.00 vpermq $1, %ymm0, %ymm2
+# CHECK-NEXT: 1 11 1.00 * vpermq $1, (%rax), %ymm2
# CHECK-NEXT: 1 5 0.33 * vpgatherdd %xmm0, (%rax,%xmm1,2), %xmm2
# CHECK-NEXT: 1 5 0.33 * vpgatherdd %ymm0, (%rax,%ymm1,2), %ymm2
# CHECK-NEXT: 1 5 0.33 * vpgatherdq %xmm0, (%rax,%xmm1,2), %xmm2
@@ -789,7 +789,7 @@ vpxor (%rax), %ymm1, %ymm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
-# CHECK-NEXT: 6.67 6.67 6.67 - - - - - 93.75 132.75 92.25 36.25 80.50 80.50 29.00 52.33 52.33 52.33 50.67 50.67 50.67 2.50 2.50
+# CHECK-NEXT: 6.67 6.67 6.67 - - - - - 93.75 128.75 92.25 36.25 80.50 80.50 29.00 52.33 52.33 52.33 50.67 50.67 50.67 2.50 2.50
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
@@ -894,13 +894,13 @@ vpxor (%rax), %ymm1, %ymm2
# CHECK-NEXT: - - - - - - - - - 1.00 - - - - - - - - - - - - - vperm2i128 $1, %ymm0, %ymm1, %ymm2
# CHECK-NEXT: - - - - - - - - - 1.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vperm2i128 $1, (%rax), %ymm1, %ymm2
# CHECK-NEXT: - - - - - - - - - 1.00 - - - - - - - - - - - - - vpermd %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 2.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermd (%rax), %ymm1, %ymm2
+# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 1.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermd (%rax), %ymm1, %ymm2
# CHECK-NEXT: - - - - - - - - - 1.00 - - - - - - - - - - - - - vpermpd $1, %ymm0, %ymm2
-# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 2.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermpd $1, (%rax), %ymm2
+# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 1.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermpd $1, (%rax), %ymm2
# CHECK-NEXT: - - - - - - - - - 1.00 - - - - - - - - - - - - - vpermps %ymm0, %ymm1, %ymm2
-# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 2.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermps (%rax), %ymm1, %ymm2
+# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 1.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermps (%rax), %ymm1, %ymm2
# CHECK-NEXT: - - - - - - - - - 1.00 - - - - - - - - - - - - - vpermq $1, %ymm0, %ymm2
-# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 2.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermq $1, (%rax), %ymm2
+# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - 1.00 - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpermq $1, (%rax), %ymm2
# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpgatherdd %xmm0, (%rax,%xmm1,2), %xmm2
# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpgatherdd %ymm0, (%rax,%ymm1,2), %ymm2
# CHECK-NEXT: 0.33 0.33 0.33 - - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpgatherdq %xmm0, (%rax,%xmm1,2), %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s
index 72d7de3..14b8e5f 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s
@@ -1207,7 +1207,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
# CHECK-NEXT: 1 3 1.00 vaddps %zmm16, %zmm17, %zmm19 {%k1} {z}
# CHECK-NEXT: 1 10 1.00 * vaddps (%rax), %zmm17, %zmm19 {%k1} {z}
# CHECK-NEXT: 1 10 1.00 * vaddps (%rax){1to16}, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: 1 4 0.50 valignd $1, %zmm16, %zmm17, %zmm19
+# CHECK-NEXT: 1 2 0.50 valignd $1, %zmm16, %zmm17, %zmm19
# CHECK-NEXT: 1 8 1.00 * valignd $1, (%rax), %zmm17, %zmm19
# CHECK-NEXT: 1 8 1.00 * valignd $1, (%rax){1to16}, %zmm17, %zmm19
# CHECK-NEXT: 1 1 1.00 valignd $1, %zmm16, %zmm17, %zmm19 {%k1}
@@ -1216,7 +1216,7 @@ vunpcklps (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
# CHECK-NEXT: 1 1 1.00 valignd $1, %zmm16, %zmm17, %zmm19 {%k1} {z}
# CHECK-NEXT: 1 8 1.00 * valignd $1, (%rax), %zmm17, %zmm19 {%k1} {z}
# CHECK-NEXT: 1 8 1.00 * valignd $1, (%rax){1to16}, %zmm17, %zmm19 {%k1} {z}
-# CHECK-NEXT: 1 4 0.50 valignq $1, %zmm16, %zmm17, %zmm19
+# CHECK-NEXT: 1 2 0.50 valignq $1, %zmm16, %zmm17, %zmm19
# CHECK-NEXT: 1 8 1.00 * valignq $1, (%rax), %zmm17, %zmm19
# CHECK-NEXT: 1 8 1.00 * valignq $1, (%rax){1to8}, %zmm17, %zmm19
# CHECK-NEXT: 1 1 1.00 valignq $1, %zmm16, %zmm17, %zmm19 {%k1}
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s
index 552b3e4..ead609e 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vl.s
@@ -1948,7 +1948,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
# CHECK-NEXT: 1 3 0.50 vaddps %ymm16, %ymm17, %ymm19 {%k1} {z}
# CHECK-NEXT: 1 10 0.50 * vaddps (%rax), %ymm17, %ymm19 {%k1} {z}
# CHECK-NEXT: 1 10 0.50 * vaddps (%rax){1to8}, %ymm17, %ymm19 {%k1} {z}
-# CHECK-NEXT: 1 4 0.50 valignd $1, %xmm16, %xmm17, %xmm19
+# CHECK-NEXT: 1 3 0.50 valignd $1, %xmm16, %xmm17, %xmm19
# CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax), %xmm17, %xmm19
# CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax){1to4}, %xmm17, %xmm19
# CHECK-NEXT: 1 1 0.50 valignd $1, %xmm16, %xmm17, %xmm19 {%k1}
@@ -1957,7 +1957,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
# CHECK-NEXT: 1 1 0.50 valignd $1, %xmm16, %xmm17, %xmm19 {%k1} {z}
# CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax), %xmm17, %xmm19 {%k1} {z}
# CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax){1to4}, %xmm17, %xmm19 {%k1} {z}
-# CHECK-NEXT: 1 4 0.50 valignd $1, %ymm16, %ymm17, %ymm19
+# CHECK-NEXT: 1 4 1.00 valignd $1, %ymm16, %ymm17, %ymm19
# CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax), %ymm17, %ymm19
# CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax){1to8}, %ymm17, %ymm19
# CHECK-NEXT: 1 1 0.50 valignd $1, %ymm16, %ymm17, %ymm19 {%k1}
@@ -1966,7 +1966,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
# CHECK-NEXT: 1 1 0.50 valignd $1, %ymm16, %ymm17, %ymm19 {%k1} {z}
# CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax), %ymm17, %ymm19 {%k1} {z}
# CHECK-NEXT: 1 8 0.50 * valignd $1, (%rax){1to8}, %ymm17, %ymm19 {%k1} {z}
-# CHECK-NEXT: 1 4 0.50 valignq $1, %xmm16, %xmm17, %xmm19
+# CHECK-NEXT: 1 3 0.50 valignq $1, %xmm16, %xmm17, %xmm19
# CHECK-NEXT: 1 8 0.50 * valignq $1, (%rax), %xmm17, %xmm19
# CHECK-NEXT: 1 8 0.50 * valignq $1, (%rax){1to2}, %xmm17, %xmm19
# CHECK-NEXT: 1 1 0.50 valignq $1, %xmm16, %xmm17, %xmm19 {%k1}
@@ -1975,7 +1975,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
# CHECK-NEXT: 1 1 0.50 valignq $1, %xmm16, %xmm17, %xmm19 {%k1} {z}
# CHECK-NEXT: 1 8 0.50 * valignq $1, (%rax), %xmm17, %xmm19 {%k1} {z}
# CHECK-NEXT: 1 8 0.50 * valignq $1, (%rax){1to2}, %xmm17, %xmm19 {%k1} {z}
-# CHECK-NEXT: 1 4 0.50 valignq $1, %ymm16, %ymm17, %ymm19
+# CHECK-NEXT: 1 4 1.00 valignq $1, %ymm16, %ymm17, %ymm19
# CHECK-NEXT: 1 8 0.50 * valignq $1, (%rax), %ymm17, %ymm19
# CHECK-NEXT: 1 8 0.50 * valignq $1, (%rax){1to4}, %ymm17, %ymm19
# CHECK-NEXT: 1 1 0.50 valignq $1, %ymm16, %ymm17, %ymm19 {%k1}
@@ -3614,7 +3614,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
-# CHECK-NEXT: 10.67 10.67 10.67 - - - - - 208.00 1083.00 636.50 261.50 509.50 509.50 32.00 355.67 355.67 355.67 334.33 334.33 334.33 32.00 32.00
+# CHECK-NEXT: 10.67 10.67 10.67 - - - - - 208.00 1084.00 637.50 261.50 509.50 509.50 32.00 355.67 355.67 355.67 334.33 334.33 334.33 32.00 32.00
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
@@ -3663,7 +3663,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - valignd $1, %xmm16, %xmm17, %xmm19 {%k1} {z}
# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignd $1, (%rax), %xmm17, %xmm19 {%k1} {z}
# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignd $1, (%rax){1to4}, %xmm17, %xmm19 {%k1} {z}
-# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - valignd $1, %ymm16, %ymm17, %ymm19
+# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - - - - - - - - - - valignd $1, %ymm16, %ymm17, %ymm19
# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignd $1, (%rax), %ymm17, %ymm19
# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignd $1, (%rax){1to8}, %ymm17, %ymm19
# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - valignd $1, %ymm16, %ymm17, %ymm19 {%k1}
@@ -3681,7 +3681,7 @@ vunpcklps (%rax){1to8}, %ymm17, %ymm19 {z}{k1}
# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - valignq $1, %xmm16, %xmm17, %xmm19 {%k1} {z}
# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignq $1, (%rax), %xmm17, %xmm19 {%k1} {z}
# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignq $1, (%rax){1to2}, %xmm17, %xmm19 {%k1} {z}
-# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - valignq $1, %ymm16, %ymm17, %ymm19
+# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - - - - - - - - - - valignq $1, %ymm16, %ymm17, %ymm19
# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignq $1, (%rax), %ymm17, %ymm19
# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - valignq $1, (%rax){1to4}, %ymm17, %ymm19
# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - valignq $1, %ymm16, %ymm17, %ymm19 {%k1}
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdq.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdq.s
index 87ba060..d1f2a98 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdq.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdq.s
@@ -13,8 +13,8 @@ vpclmulqdq $11, (%rax), %zmm17, %zmm19
# CHECK-NEXT: [6]: HasSideEffects (U)
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
-# CHECK-NEXT: 4 4 2.00 vpclmulqdq $11, %zmm16, %zmm17, %zmm19
-# CHECK-NEXT: 4 11 2.00 * vpclmulqdq $11, (%rax), %zmm17, %zmm19
+# CHECK-NEXT: 4 4 1.50 vpclmulqdq $11, %zmm16, %zmm17, %zmm19
+# CHECK-NEXT: 4 11 1.50 * vpclmulqdq $11, (%rax), %zmm17, %zmm19
# CHECK: Resources:
# CHECK-NEXT: [0] - Zn4AGU0
@@ -43,9 +43,9 @@ vpclmulqdq $11, (%rax), %zmm17, %zmm19
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
-# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - -
+# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
-# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - - - - - - - - vpclmulqdq $11, %zmm16, %zmm17, %zmm19
-# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %zmm17, %zmm19
+# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - - - - - - - - - - - - vpclmulqdq $11, %zmm16, %zmm17, %zmm19
+# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %zmm17, %zmm19
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdqvl.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdqvl.s
index 3c80c56..ea7a280 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdqvl.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512vpclmulqdqvl.s
@@ -16,10 +16,10 @@ vpclmulqdq $11, (%rax), %ymm17, %ymm19
# CHECK-NEXT: [6]: HasSideEffects (U)
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
-# CHECK-NEXT: 4 4 2.00 vpclmulqdq $11, %xmm16, %xmm17, %xmm19
-# CHECK-NEXT: 4 11 2.00 * vpclmulqdq $11, (%rax), %xmm17, %xmm19
-# CHECK-NEXT: 4 4 2.00 vpclmulqdq $11, %ymm16, %ymm17, %ymm19
-# CHECK-NEXT: 4 11 2.00 * vpclmulqdq $11, (%rax), %ymm17, %ymm19
+# CHECK-NEXT: 4 4 1.50 vpclmulqdq $11, %xmm16, %xmm17, %xmm19
+# CHECK-NEXT: 4 11 1.50 * vpclmulqdq $11, (%rax), %xmm17, %xmm19
+# CHECK-NEXT: 4 4 1.50 vpclmulqdq $11, %ymm16, %ymm17, %ymm19
+# CHECK-NEXT: 4 11 1.50 * vpclmulqdq $11, (%rax), %ymm17, %ymm19
# CHECK: Resources:
# CHECK-NEXT: [0] - Zn4AGU0
@@ -48,11 +48,11 @@ vpclmulqdq $11, (%rax), %ymm17, %ymm19
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
-# CHECK-NEXT: - - - - - - - - 8.00 8.00 - - 1.00 1.00 - 0.67 0.67 0.67 0.67 0.67 0.67 - -
+# CHECK-NEXT: - - - - - - - - 6.00 6.00 - - 1.00 1.00 - 0.67 0.67 0.67 0.67 0.67 0.67 - -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
-# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - - - - - - - - vpclmulqdq $11, %xmm16, %xmm17, %xmm19
-# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %xmm17, %xmm19
-# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - - - - - - - - vpclmulqdq $11, %ymm16, %ymm17, %ymm19
-# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %ymm17, %ymm19
+# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - - - - - - - - - - - - vpclmulqdq $11, %xmm16, %xmm17, %xmm19
+# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %xmm17, %xmm19
+# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - - - - - - - - - - - - vpclmulqdq $11, %ymm16, %ymm17, %ymm19
+# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %ymm17, %ymm19
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-bmi1.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-bmi1.s
index f4888cf..afbd566 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-bmi1.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-bmi1.s
@@ -69,12 +69,12 @@ tzcnt (%rax), %rcx
# CHECK-NEXT: 2 5 0.33 * blsrl (%rax), %ecx
# CHECK-NEXT: 1 1 0.25 blsrq %rax, %rcx
# CHECK-NEXT: 2 5 0.33 * blsrq (%rax), %rcx
-# CHECK-NEXT: 2 2 1.00 tzcntw %ax, %cx
-# CHECK-NEXT: 2 6 0.50 * tzcntw (%rax), %cx
-# CHECK-NEXT: 2 2 0.50 tzcntl %eax, %ecx
-# CHECK-NEXT: 2 6 0.50 * tzcntl (%rax), %ecx
-# CHECK-NEXT: 2 2 0.50 tzcntq %rax, %rcx
-# CHECK-NEXT: 2 6 0.50 * tzcntq (%rax), %rcx
+# CHECK-NEXT: 1 1 0.25 tzcntw %ax, %cx
+# CHECK-NEXT: 1 5 0.50 * tzcntw (%rax), %cx
+# CHECK-NEXT: 1 1 0.50 tzcntl %eax, %ecx
+# CHECK-NEXT: 1 5 0.50 * tzcntl (%rax), %ecx
+# CHECK-NEXT: 1 1 0.50 tzcntq %rax, %rcx
+# CHECK-NEXT: 1 5 0.50 * tzcntq (%rax), %rcx
# CHECK: Resources:
# CHECK-NEXT: [0] - Zn4AGU0
@@ -103,7 +103,7 @@ tzcnt (%rax), %rcx
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
-# CHECK-NEXT: 4.33 4.33 4.33 5.00 9.50 9.50 5.00 - - - - - - - - 4.33 4.33 4.33 4.33 4.33 4.33 - -
+# CHECK-NEXT: 4.33 4.33 4.33 4.25 8.75 8.75 4.25 - - - - - - - - 4.33 4.33 4.33 4.33 4.33 4.33 - -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
@@ -127,7 +127,7 @@ tzcnt (%rax), %rcx
# CHECK-NEXT: 0.33 0.33 0.33 0.25 0.25 0.25 0.25 - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - blsrl (%rax), %ecx
# CHECK-NEXT: - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - - - - - - blsrq %rax, %rcx
# CHECK-NEXT: 0.33 0.33 0.33 0.25 0.25 0.25 0.25 - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - blsrq (%rax), %rcx
-# CHECK-NEXT: - - - 1.00 1.00 1.00 1.00 - - - - - - - - - - - - - - - - tzcntw %ax, %cx
+# CHECK-NEXT: - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - - - - - - tzcntw %ax, %cx
# CHECK-NEXT: 0.33 0.33 0.33 - 0.50 0.50 - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - tzcntw (%rax), %cx
# CHECK-NEXT: - - - - 0.50 0.50 - - - - - - - - - - - - - - - - - tzcntl %eax, %ecx
# CHECK-NEXT: 0.33 0.33 0.33 - 0.50 0.50 - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - tzcntl (%rax), %ecx
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-cmpxchg.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-cmpxchg.s
index 64feeaf..26a42fd 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-cmpxchg.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-cmpxchg.s
@@ -15,10 +15,10 @@ lock cmpxchg16b (%rax)
# CHECK-NEXT: [6]: HasSideEffects (U)
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
-# CHECK-NEXT: 19 3 6.00 * * cmpxchg8b (%rax)
-# CHECK-NEXT: 28 4 14.75 * * cmpxchg16b (%rax)
-# CHECK-NEXT: 19 3 6.00 * * lock cmpxchg8b (%rax)
-# CHECK-NEXT: 28 4 14.75 * * lock cmpxchg16b (%rax)
+# CHECK-NEXT: 15 3 5.00 * * cmpxchg8b (%rax)
+# CHECK-NEXT: 26 2 10.00 * * cmpxchg16b (%rax)
+# CHECK-NEXT: 15 3 5.00 * * lock cmpxchg8b (%rax)
+# CHECK-NEXT: 26 2 10.00 * * lock cmpxchg16b (%rax)
# CHECK: Resources:
# CHECK-NEXT: [0] - Zn4AGU0
@@ -47,11 +47,11 @@ lock cmpxchg16b (%rax)
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
-# CHECK-NEXT: - - - 41.50 41.50 41.50 41.50 - - - - - - - - - - - - - - - -
+# CHECK-NEXT: - - - 30.00 30.00 30.00 30.00 - - - - - - - - - - - - - - - -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
-# CHECK-NEXT: - - - 6.00 6.00 6.00 6.00 - - - - - - - - - - - - - - - - cmpxchg8b (%rax)
-# CHECK-NEXT: - - - 14.75 14.75 14.75 14.75 - - - - - - - - - - - - - - - - cmpxchg16b (%rax)
-# CHECK-NEXT: - - - 6.00 6.00 6.00 6.00 - - - - - - - - - - - - - - - - lock cmpxchg8b (%rax)
-# CHECK-NEXT: - - - 14.75 14.75 14.75 14.75 - - - - - - - - - - - - - - - - lock cmpxchg16b (%rax)
+# CHECK-NEXT: - - - 5.00 5.00 5.00 5.00 - - - - - - - - - - - - - - - - cmpxchg8b (%rax)
+# CHECK-NEXT: - - - 10.00 10.00 10.00 10.00 - - - - - - - - - - - - - - - - cmpxchg16b (%rax)
+# CHECK-NEXT: - - - 5.00 5.00 5.00 5.00 - - - - - - - - - - - - - - - - lock cmpxchg8b (%rax)
+# CHECK-NEXT: - - - 10.00 10.00 10.00 10.00 - - - - - - - - - - - - - - - - lock cmpxchg16b (%rax)
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-pclmul.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-pclmul.s
index a36fb2aa..fc2bc8e 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-pclmul.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-pclmul.s
@@ -13,8 +13,8 @@ pclmulqdq $11, (%rax), %xmm2
# CHECK-NEXT: [6]: HasSideEffects (U)
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
-# CHECK-NEXT: 4 4 2.00 pclmulqdq $11, %xmm0, %xmm2
-# CHECK-NEXT: 4 11 2.00 * pclmulqdq $11, (%rax), %xmm2
+# CHECK-NEXT: 4 4 1.50 pclmulqdq $11, %xmm0, %xmm2
+# CHECK-NEXT: 4 11 1.50 * pclmulqdq $11, (%rax), %xmm2
# CHECK: Resources:
# CHECK-NEXT: [0] - Zn4AGU0
@@ -43,9 +43,9 @@ pclmulqdq $11, (%rax), %xmm2
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
-# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - -
+# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
-# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - - - - - - - - pclmulqdq $11, %xmm0, %xmm2
-# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - pclmulqdq $11, (%rax), %xmm2
+# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - - - - - - - - - - - - pclmulqdq $11, %xmm0, %xmm2
+# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - pclmulqdq $11, (%rax), %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-sse42.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-sse42.s
index 015d37e..ae60835 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-sse42.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-sse42.s
@@ -52,12 +52,12 @@ pcmpgtq (%rax), %xmm2
# CHECK-NEXT: 1 7 1.00 * crc32q (%rax), %rcx
# CHECK-NEXT: 8 6 3.00 pcmpestri $1, %xmm0, %xmm2
# CHECK-NEXT: 12 13 3.00 * pcmpestri $1, (%rax), %xmm2
-# CHECK-NEXT: 7 6 3.00 pcmpestrm $1, %xmm0, %xmm2
-# CHECK-NEXT: 12 13 3.00 * pcmpestrm $1, (%rax), %xmm2
+# CHECK-NEXT: 7 7 3.00 pcmpestrm $1, %xmm0, %xmm2
+# CHECK-NEXT: 12 14 3.00 * pcmpestrm $1, (%rax), %xmm2
# CHECK-NEXT: 4 2 2.00 pcmpistri $1, %xmm0, %xmm2
# CHECK-NEXT: 4 9 2.00 * pcmpistri $1, (%rax), %xmm2
-# CHECK-NEXT: 3 6 2.00 pcmpistrm $1, %xmm0, %xmm2
-# CHECK-NEXT: 4 13 2.00 * pcmpistrm $1, (%rax), %xmm2
+# CHECK-NEXT: 3 7 2.00 pcmpistrm $1, %xmm0, %xmm2
+# CHECK-NEXT: 4 14 2.00 * pcmpistrm $1, (%rax), %xmm2
# CHECK-NEXT: 1 1 0.25 pcmpgtq %xmm0, %xmm2
# CHECK-NEXT: 1 8 0.50 * pcmpgtq (%rax), %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-vpclmulqdq.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-vpclmulqdq.s
index 55a36d0..dca4703 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-vpclmulqdq.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-vpclmulqdq.s
@@ -13,8 +13,8 @@ vpclmulqdq $11, (%rax), %ymm1, %ymm3
# CHECK-NEXT: [6]: HasSideEffects (U)
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
-# CHECK-NEXT: 4 4 2.00 vpclmulqdq $11, %ymm0, %ymm1, %ymm3
-# CHECK-NEXT: 4 11 2.00 * vpclmulqdq $11, (%rax), %ymm1, %ymm3
+# CHECK-NEXT: 4 4 1.50 vpclmulqdq $11, %ymm0, %ymm1, %ymm3
+# CHECK-NEXT: 4 11 1.50 * vpclmulqdq $11, (%rax), %ymm1, %ymm3
# CHECK: Resources:
# CHECK-NEXT: [0] - Zn4AGU0
@@ -43,9 +43,9 @@ vpclmulqdq $11, (%rax), %ymm1, %ymm3
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
-# CHECK-NEXT: - - - - - - - - 4.00 4.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - -
+# CHECK-NEXT: - - - - - - - - 3.00 3.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
-# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - - - - - - - - vpclmulqdq $11, %ymm0, %ymm1, %ymm3
-# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %ymm1, %ymm3
+# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - - - - - - - - - - - - vpclmulqdq $11, %ymm0, %ymm1, %ymm3
+# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %ymm1, %ymm3
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-x86_64.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-x86_64.s
index 9c5b4e4..886d9c6 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-x86_64.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-x86_64.s
@@ -1173,18 +1173,18 @@ xorq (%rax), %rdi
# CHECK-NEXT: 1 6 0.67 * * andq %rsi, (%rax)
# CHECK-NEXT: 1 6 0.67 * * lock andq %rsi, (%rax)
# CHECK-NEXT: 1 5 0.33 * andq (%rax), %rdi
-# CHECK-NEXT: 6 1 1.00 bsfw %si, %di
-# CHECK-NEXT: 6 1 1.00 bsrw %si, %di
-# CHECK-NEXT: 7 5 1.00 * bsfw (%rax), %di
-# CHECK-NEXT: 7 5 1.00 * bsrw (%rax), %di
-# CHECK-NEXT: 6 1 1.00 bsfl %esi, %edi
-# CHECK-NEXT: 6 1 1.00 bsrl %esi, %edi
-# CHECK-NEXT: 7 5 1.00 * bsfl (%rax), %edi
-# CHECK-NEXT: 7 5 1.00 * bsrl (%rax), %edi
-# CHECK-NEXT: 6 1 1.00 bsfq %rsi, %rdi
-# CHECK-NEXT: 6 1 1.00 bsrq %rsi, %rdi
-# CHECK-NEXT: 7 5 1.00 * bsfq (%rax), %rdi
-# CHECK-NEXT: 7 5 1.00 * bsrq (%rax), %rdi
+# CHECK-NEXT: 1 1 1.00 bsfw %si, %di
+# CHECK-NEXT: 1 1 1.00 bsrw %si, %di
+# CHECK-NEXT: 2 5 1.00 * bsfw (%rax), %di
+# CHECK-NEXT: 2 5 1.00 * bsrw (%rax), %di
+# CHECK-NEXT: 1 1 1.00 bsfl %esi, %edi
+# CHECK-NEXT: 1 1 1.00 bsrl %esi, %edi
+# CHECK-NEXT: 2 5 1.00 * bsfl (%rax), %edi
+# CHECK-NEXT: 2 5 1.00 * bsrl (%rax), %edi
+# CHECK-NEXT: 1 1 1.00 bsfq %rsi, %rdi
+# CHECK-NEXT: 1 1 1.00 bsrq %rsi, %rdi
+# CHECK-NEXT: 2 5 1.00 * bsfq (%rax), %rdi
+# CHECK-NEXT: 2 5 1.00 * bsrq (%rax), %rdi
# CHECK-NEXT: 1 1 0.25 bswapl %eax
# CHECK-NEXT: 1 1 0.25 bswapq %rax
# CHECK-NEXT: 1 1 0.50 btw %si, %di
@@ -1321,23 +1321,23 @@ xorq (%rax), %rdi
# CHECK-NEXT: 1 1 0.25 decq %rdi
# CHECK-NEXT: 1 6 0.67 * * decq (%rax)
# CHECK-NEXT: 1 6 0.67 * * lock decq (%rax)
-# CHECK-NEXT: 2 10 10.00 U divb %dil
-# CHECK-NEXT: 2 14 10.00 * U divb (%rax)
-# CHECK-NEXT: 2 11 11.00 U divw %si
-# CHECK-NEXT: 2 15 11.00 * U divw (%rax)
-# CHECK-NEXT: 2 13 13.00 U divl %edx
-# CHECK-NEXT: 2 17 13.00 * U divl (%rax)
-# CHECK-NEXT: 2 17 17.00 U divq %rcx
-# CHECK-NEXT: 2 21 17.00 * U divq (%rax)
+# CHECK-NEXT: 2 9 9.00 U divb %dil
+# CHECK-NEXT: 2 13 9.00 * U divb (%rax)
+# CHECK-NEXT: 2 10 10.00 U divw %si
+# CHECK-NEXT: 2 14 10.00 * U divw (%rax)
+# CHECK-NEXT: 2 12 12.00 U divl %edx
+# CHECK-NEXT: 2 16 12.00 * U divl (%rax)
+# CHECK-NEXT: 2 18 18.00 U divq %rcx
+# CHECK-NEXT: 2 22 18.00 * U divq (%rax)
# CHECK-NEXT: 100 100 25.00 U enter $7, $4095
-# CHECK-NEXT: 2 10 10.00 U idivb %dil
-# CHECK-NEXT: 2 14 10.00 * U idivb (%rax)
-# CHECK-NEXT: 2 11 11.00 U idivw %si
-# CHECK-NEXT: 2 15 11.00 * U idivw (%rax)
-# CHECK-NEXT: 2 13 13.00 U idivl %edx
-# CHECK-NEXT: 2 17 13.00 * U idivl (%rax)
-# CHECK-NEXT: 2 17 17.00 U idivq %rcx
-# CHECK-NEXT: 2 21 17.00 * U idivq (%rax)
+# CHECK-NEXT: 2 9 9.00 U idivb %dil
+# CHECK-NEXT: 2 13 9.00 * U idivb (%rax)
+# CHECK-NEXT: 2 10 10.00 U idivw %si
+# CHECK-NEXT: 2 14 10.00 * U idivw (%rax)
+# CHECK-NEXT: 2 12 12.00 U idivl %edx
+# CHECK-NEXT: 2 16 12.00 * U idivl (%rax)
+# CHECK-NEXT: 2 18 18.00 U idivq %rcx
+# CHECK-NEXT: 2 22 18.00 * U idivq (%rax)
# CHECK-NEXT: 1 3 3.00 imulb %dil
# CHECK-NEXT: 1 7 3.00 * imulb (%rax)
# CHECK-NEXT: 3 3 3.00 imulw %di
@@ -1891,12 +1891,12 @@ xorq (%rax), %rdi
# CHECK-NEXT: 1 5 0.67 * * xaddq %rax, (%rbx)
# CHECK-NEXT: 1 5 0.67 * * lock xaddq %rax, (%rbx)
# CHECK-NEXT: 2 1 0.50 xchgb %bl, %cl
-# CHECK-NEXT: 5 7 0.50 * * xchgb %bl, (%rbx)
-# CHECK-NEXT: 5 7 0.50 * * lock xchgb %bl, (%rbx)
+# CHECK-NEXT: 2 7 0.50 * * xchgb %bl, (%rbx)
+# CHECK-NEXT: 2 7 0.50 * * lock xchgb %bl, (%rbx)
# CHECK-NEXT: 2 1 0.50 xchgw %bx, %ax
# CHECK-NEXT: 2 1 0.50 xchgw %bx, %cx
-# CHECK-NEXT: 5 7 0.50 * * xchgw %ax, (%rbx)
-# CHECK-NEXT: 5 7 0.50 * * lock xchgw %ax, (%rbx)
+# CHECK-NEXT: 2 7 0.50 * * xchgw %ax, (%rbx)
+# CHECK-NEXT: 2 7 0.50 * * lock xchgw %ax, (%rbx)
# CHECK-NEXT: 2 0 0.33 xchgl %ebx, %eax
# CHECK-NEXT: 2 0 0.33 xchgl %ebx, %ecx
# CHECK-NEXT: 2 6 0.50 * * xchgl %eax, (%rbx)
@@ -1975,7 +1975,7 @@ xorq (%rax), %rdi
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
-# CHECK-NEXT: 259.00 259.00 259.00 1733.00 1865.50 1775.50 1529.50 1.50 - - - - - - - 259.00 259.00 259.00 151.67 151.67 151.67 161.00 161.00
+# CHECK-NEXT: 259.00 259.00 259.00 1725.00 1865.50 1775.50 1529.50 1.50 - - - - - - - 259.00 259.00 259.00 151.67 151.67 151.67 161.00 161.00
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
@@ -2266,23 +2266,23 @@ xorq (%rax), %rdi
# CHECK-NEXT: - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - - - - - - decq %rdi
# CHECK-NEXT: 0.67 0.67 0.67 0.25 0.25 0.25 0.25 - - - - - - - - 0.67 0.67 0.67 0.33 0.33 0.33 0.50 0.50 decq (%rax)
# CHECK-NEXT: 0.67 0.67 0.67 0.25 0.25 0.25 0.25 - - - - - - - - 0.67 0.67 0.67 0.33 0.33 0.33 0.50 0.50 lock decq (%rax)
-# CHECK-NEXT: - - - 10.00 - - - - - - - - - - - - - - - - - - - divb %dil
-# CHECK-NEXT: 0.33 0.33 0.33 10.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divb (%rax)
-# CHECK-NEXT: - - - 11.00 - - - - - - - - - - - - - - - - - - - divw %si
-# CHECK-NEXT: 0.33 0.33 0.33 11.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divw (%rax)
-# CHECK-NEXT: - - - 13.00 - - - - - - - - - - - - - - - - - - - divl %edx
-# CHECK-NEXT: 0.33 0.33 0.33 13.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divl (%rax)
-# CHECK-NEXT: - - - 17.00 - - - - - - - - - - - - - - - - - - - divq %rcx
-# CHECK-NEXT: 0.33 0.33 0.33 17.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divq (%rax)
+# CHECK-NEXT: - - - 9.00 - - - - - - - - - - - - - - - - - - - divb %dil
+# CHECK-NEXT: 0.33 0.33 0.33 9.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divb (%rax)
+# CHECK-NEXT: - - - 10.00 - - - - - - - - - - - - - - - - - - - divw %si
+# CHECK-NEXT: 0.33 0.33 0.33 10.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divw (%rax)
+# CHECK-NEXT: - - - 12.00 - - - - - - - - - - - - - - - - - - - divl %edx
+# CHECK-NEXT: 0.33 0.33 0.33 12.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divl (%rax)
+# CHECK-NEXT: - - - 18.00 - - - - - - - - - - - - - - - - - - - divq %rcx
+# CHECK-NEXT: 0.33 0.33 0.33 18.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - divq (%rax)
# CHECK-NEXT: - - - 25.00 25.00 25.00 25.00 - - - - - - - - - - - - - - - - enter $7, $4095
-# CHECK-NEXT: - - - 10.00 - - - - - - - - - - - - - - - - - - - idivb %dil
-# CHECK-NEXT: 0.33 0.33 0.33 10.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivb (%rax)
-# CHECK-NEXT: - - - 11.00 - - - - - - - - - - - - - - - - - - - idivw %si
-# CHECK-NEXT: 0.33 0.33 0.33 11.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivw (%rax)
-# CHECK-NEXT: - - - 13.00 - - - - - - - - - - - - - - - - - - - idivl %edx
-# CHECK-NEXT: 0.33 0.33 0.33 13.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivl (%rax)
-# CHECK-NEXT: - - - 17.00 - - - - - - - - - - - - - - - - - - - idivq %rcx
-# CHECK-NEXT: 0.33 0.33 0.33 17.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivq (%rax)
+# CHECK-NEXT: - - - 9.00 - - - - - - - - - - - - - - - - - - - idivb %dil
+# CHECK-NEXT: 0.33 0.33 0.33 9.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivb (%rax)
+# CHECK-NEXT: - - - 10.00 - - - - - - - - - - - - - - - - - - - idivw %si
+# CHECK-NEXT: 0.33 0.33 0.33 10.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivw (%rax)
+# CHECK-NEXT: - - - 12.00 - - - - - - - - - - - - - - - - - - - idivl %edx
+# CHECK-NEXT: 0.33 0.33 0.33 12.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivl (%rax)
+# CHECK-NEXT: - - - 18.00 - - - - - - - - - - - - - - - - - - - idivq %rcx
+# CHECK-NEXT: 0.33 0.33 0.33 18.00 - - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - idivq (%rax)
# CHECK-NEXT: - - - - 3.00 - - - - - - - - - - - - - - - - - - imulb %dil
# CHECK-NEXT: 0.33 0.33 0.33 - 3.00 - - - - - - - - - - 0.33 0.33 0.33 0.33 0.33 0.33 - - imulb (%rax)
# CHECK-NEXT: - - - - 3.00 - - - - - - - - - - - - - - - - - - imulw %di
diff --git a/llvm/unittests/AsmParser/AsmParserTest.cpp b/llvm/unittests/AsmParser/AsmParserTest.cpp
index ce22670..898a829 100644
--- a/llvm/unittests/AsmParser/AsmParserTest.cpp
+++ b/llvm/unittests/AsmParser/AsmParserTest.cpp
@@ -6,7 +6,9 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/AsmParser/AsmParserContext.h"
#include "llvm/AsmParser/Parser.h"
#include "llvm/AsmParser/SlotMapping.h"
#include "llvm/IR/Constants.h"
@@ -14,10 +16,14 @@
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/SourceMgr.h"
#include "gtest/gtest.h"
+#define DEBUG_TYPE "unittest-asm-parser-tests"
+
using namespace llvm;
namespace {
@@ -479,4 +485,53 @@ TEST(AsmParserTest, DIExpressionBodyAtBeginningWithSlotMappingParsing) {
ASSERT_EQ(Mapping.MetadataNodes.size(), 0u);
}
+#define ASSERT_EQ_LOC(Loc1, Loc2) \
+ do { \
+ EXPECT_TRUE(Loc1.contains(Loc2) && Loc2.contains(Loc1)) \
+ << #Loc1 " location: " << Loc1.Start.Line << ":" << Loc1.Start.Col \
+ << " - " << Loc1.End.Line << ":" << Loc1.End.Col << "\n" \
+ << #Loc2 " location: " << Loc2.Start.Line << ":" << Loc2.Start.Col \
+ << " - " << Loc2.End.Line << ":" << Loc2.End.Col << "\n"; \
+ } while (false)
+
+TEST(AsmParserTest, ParserObjectLocations) {
+ StringRef Source = "define i32 @main() {\n"
+ "entry:\n"
+ " %a = add i32 1, 2\n"
+ " ret i32 %a\n"
+ "}\n";
+ LLVMContext Ctx;
+ SMDiagnostic Error;
+ SlotMapping Mapping;
+ AsmParserContext ParserContext;
+ auto Mod = parseAssemblyString(Source, Error, Ctx, &Mapping, &ParserContext);
+
+ auto *MainFn = Mod->getFunction("main");
+ ASSERT_TRUE(MainFn != nullptr);
+
+ auto MaybeMainLoc = ParserContext.getFunctionLocation(MainFn);
+ EXPECT_TRUE(MaybeMainLoc.has_value());
+ auto MainLoc = MaybeMainLoc.value();
+ auto ExpectedMainLoc = FileLocRange(FileLoc{0, 0}, FileLoc{4, 1});
+ ASSERT_EQ_LOC(MainLoc, ExpectedMainLoc);
+
+ auto &EntryBB = MainFn->getEntryBlock();
+ auto MaybeEntryBBLoc = ParserContext.getBlockLocation(&EntryBB);
+ ASSERT_TRUE(MaybeEntryBBLoc.has_value());
+ auto EntryBBLoc = MaybeEntryBBLoc.value();
+ auto ExpectedEntryBBLoc = FileLocRange(FileLoc{1, 0}, FileLoc{3, 14});
+ ASSERT_EQ_LOC(EntryBBLoc, ExpectedEntryBBLoc);
+
+ SmallVector<FileLocRange> InstructionLocations = {
+ FileLocRange(FileLoc{2, 4}, FileLoc{2, 21}),
+ FileLocRange(FileLoc{3, 4}, FileLoc{3, 14})};
+
+ for (const auto &[Inst, ExpectedLoc] : zip(EntryBB, InstructionLocations)) {
+ auto MaybeInstLoc = ParserContext.getInstructionLocation(&Inst);
+ ASSERT_TRUE(MaybeMainLoc.has_value());
+ auto InstLoc = MaybeInstLoc.value();
+ ASSERT_EQ_LOC(InstLoc, ExpectedLoc);
+ }
+}
+
} // end anonymous namespace
diff --git a/llvm/unittests/CodeGen/AsmPrinterDwarfTest.cpp b/llvm/unittests/CodeGen/AsmPrinterDwarfTest.cpp
index 6c08173..af2d56d 100644
--- a/llvm/unittests/CodeGen/AsmPrinterDwarfTest.cpp
+++ b/llvm/unittests/CodeGen/AsmPrinterDwarfTest.cpp
@@ -383,14 +383,14 @@ class AsmPrinterHandlerTest : public AsmPrinterFixtureBase {
public:
TestHandler(AsmPrinterHandlerTest &Test) : Test(Test) {}
- virtual ~TestHandler() {}
- virtual void setSymbolSize(const MCSymbol *Sym, uint64_t Size) override {}
- virtual void beginModule(Module *M) override { Test.BeginCount++; }
- virtual void endModule() override { Test.EndCount++; }
- virtual void beginFunction(const MachineFunction *MF) override {}
- virtual void endFunction(const MachineFunction *MF) override {}
- virtual void beginInstruction(const MachineInstr *MI) override {}
- virtual void endInstruction() override {}
+ ~TestHandler() override {}
+ void setSymbolSize(const MCSymbol *Sym, uint64_t Size) override {}
+ void beginModule(Module *M) override { Test.BeginCount++; }
+ void endModule() override { Test.EndCount++; }
+ void beginFunction(const MachineFunction *MF) override {}
+ void endFunction(const MachineFunction *MF) override {}
+ void beginInstruction(const MachineInstr *MI) override {}
+ void endInstruction() override {}
};
protected:
diff --git a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
index ce2a38b..ff87e7b 100644
--- a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
+++ b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
@@ -69,7 +69,7 @@ public:
InstrRefLDVTest() : Ctx(), Mod(std::make_unique<Module>("beehives", Ctx)) {}
- void SetUp() {
+ void SetUp() override {
// Boilerplate that creates a MachineFunction and associated blocks.
Mod->setDataLayout("e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-"
diff --git a/llvm/unittests/CodeGen/MIR2VecTest.cpp b/llvm/unittests/CodeGen/MIR2VecTest.cpp
index 8710d6b..d42749c 100644
--- a/llvm/unittests/CodeGen/MIR2VecTest.cpp
+++ b/llvm/unittests/CodeGen/MIR2VecTest.cpp
@@ -54,6 +54,9 @@ protected:
std::unique_ptr<Module> M;
std::unique_ptr<TargetMachine> TM;
const TargetInstrInfo *TII = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ std::unique_ptr<MachineModuleInfo> MMI;
+ MachineFunction *MF = nullptr;
static void SetUpTestCase() {
InitializeAllTargets();
@@ -90,15 +93,24 @@ protected:
Function *F =
Function::Create(FT, Function::ExternalLinkage, "test", M.get());
- // Get the target instruction info
+ // Create MMI and MF to get TRI and MRI
+ MMI = std::make_unique<MachineModuleInfo>(TM.get());
+ MF = &MMI->getOrCreateMachineFunction(*F);
+
+ // Get the target instruction info and register info
TII = TM->getSubtargetImpl(*F)->getInstrInfo();
- if (!TII) {
- GTEST_SKIP() << "Failed to get target instruction info; Skipping test";
+ TRI = TM->getSubtargetImpl(*F)->getRegisterInfo();
+ if (!TII || !TRI) {
+ GTEST_SKIP()
+ << "Failed to get target instruction/register info; Skipping test";
return;
}
}
- void TearDown() override { TII = nullptr; }
+ void TearDown() override {
+ TII = nullptr;
+ TRI = nullptr;
+ }
// Find an opcode by name
int findOpcodeByName(StringRef Name) {
@@ -110,17 +122,94 @@ protected:
}
// Create a vocabulary with specific opcodes and embeddings
- Expected<MIRVocabulary>
- createTestVocab(std::initializer_list<std::pair<const char *, float>> opcodes,
- unsigned dimension = 2) {
- assert(TII && "TargetInstrInfo not initialized");
- VocabMap VMap;
- for (const auto &[name, value] : opcodes)
- VMap[name] = Embedding(dimension, value);
- return MIRVocabulary::create(std::move(VMap), *TII);
+ // This might cause errors in future when the validation in
+ // MIRVocabulary::generateStorage() enforces hard checks on the vocabulary
+ // entries.
+ Expected<MIRVocabulary> createTestVocab(
+ std::initializer_list<std::pair<const char *, float>> Opcodes,
+ std::initializer_list<std::pair<const char *, float>> CommonOperands,
+ std::initializer_list<std::pair<const char *, float>> PhyRegs,
+ std::initializer_list<std::pair<const char *, float>> VirtRegs,
+ unsigned Dimension = 2) {
+ assert(TII && TRI && MF && "Target info not initialized");
+ VocabMap OpcodeMap, CommonOperandMap, PhyRegMap, VirtRegMap;
+ for (const auto &[Name, Value] : Opcodes)
+ OpcodeMap[Name] = Embedding(Dimension, Value);
+
+ for (const auto &[Name, Value] : CommonOperands)
+ CommonOperandMap[Name] = Embedding(Dimension, Value);
+
+ for (const auto &[Name, Value] : PhyRegs)
+ PhyRegMap[Name] = Embedding(Dimension, Value);
+
+ for (const auto &[Name, Value] : VirtRegs)
+ VirtRegMap[Name] = Embedding(Dimension, Value);
+
+ // If any section is empty, create minimal maps for other vocabulary
+ // sections to satisfy validation
+ if (Opcodes.size() == 0)
+ OpcodeMap["NOOP"] = Embedding(Dimension, 0.0f);
+ if (CommonOperands.size() == 0)
+ CommonOperandMap["Immediate"] = Embedding(Dimension, 0.0f);
+ if (PhyRegs.size() == 0)
+ PhyRegMap["GR32"] = Embedding(Dimension, 0.0f);
+ if (VirtRegs.size() == 0)
+ VirtRegMap["GR32"] = Embedding(Dimension, 0.0f);
+
+ return MIRVocabulary::create(
+ std::move(OpcodeMap), std::move(CommonOperandMap), std::move(PhyRegMap),
+ std::move(VirtRegMap), *TII, *TRI, MF->getRegInfo());
}
};
+// Parameterized test for empty vocab sections
+class MIR2VecVocabEmptySectionTestFixture
+ : public MIR2VecVocabTestFixture,
+ public ::testing::WithParamInterface<int> {
+protected:
+ void SetUp() override {
+ MIR2VecVocabTestFixture::SetUp();
+ // If base class setup was skipped (TII not initialized), skip derived setup
+ if (!TII)
+ GTEST_SKIP() << "Failed to get target instruction info in "
+ "the base class setup; Skipping test";
+ }
+};
+
+TEST_P(MIR2VecVocabEmptySectionTestFixture, EmptySectionFailsValidation) {
+ int EmptySection = GetParam();
+ VocabMap OpcodeMap, CommonOperandMap, PhyRegMap, VirtRegMap;
+
+ if (EmptySection != 0)
+ OpcodeMap["ADD"] = Embedding(2, 1.0f);
+ if (EmptySection != 1)
+ CommonOperandMap["Immediate"] = Embedding(2, 0.0f);
+ if (EmptySection != 2)
+ PhyRegMap["GR32"] = Embedding(2, 0.0f);
+ if (EmptySection != 3)
+ VirtRegMap["GR32"] = Embedding(2, 0.0f);
+
+ ASSERT_TRUE(TII != nullptr);
+ ASSERT_TRUE(TRI != nullptr);
+ ASSERT_TRUE(MF != nullptr);
+
+ auto VocabOrErr = MIRVocabulary::create(
+ std::move(OpcodeMap), std::move(CommonOperandMap), std::move(PhyRegMap),
+ std::move(VirtRegMap), *TII, *TRI, MF->getRegInfo());
+ EXPECT_FALSE(static_cast<bool>(VocabOrErr))
+ << "Factory method should fail when section " << EmptySection
+ << " is empty";
+
+ if (!VocabOrErr) {
+ auto Err = VocabOrErr.takeError();
+ std::string ErrorMsg = toString(std::move(Err));
+ EXPECT_FALSE(ErrorMsg.empty());
+ }
+}
+
+INSTANTIATE_TEST_SUITE_P(EmptySection, MIR2VecVocabEmptySectionTestFixture,
+ ::testing::Values(0, 1, 2, 3));
+
TEST_F(MIR2VecVocabTestFixture, CanonicalOpcodeMappingTest) {
// Test that same base opcodes get same canonical indices
std::string BaseName1 = MIRVocabulary::extractBaseOpcodeName("ADD16ri");
@@ -133,7 +222,7 @@ TEST_F(MIR2VecVocabTestFixture, CanonicalOpcodeMappingTest) {
// Create a MIRVocabulary instance to test the mapping
// Use a minimal MIRVocabulary to trigger canonical mapping construction
Embedding Val = Embedding(64, 1.0f);
- auto TestVocabOrErr = createTestVocab({{"ADD", 1.0f}}, 64);
+ auto TestVocabOrErr = createTestVocab({{"ADD", 1.0f}}, {}, {}, {}, 64);
ASSERT_TRUE(static_cast<bool>(TestVocabOrErr))
<< "Failed to create vocabulary: "
<< toString(TestVocabOrErr.takeError());
@@ -190,7 +279,7 @@ TEST_F(MIR2VecVocabTestFixture, DeterministicMapping) {
// Create a MIRVocabulary instance to test deterministic mapping
// Use a minimal MIRVocabulary to trigger canonical mapping construction
- auto TestVocabOrErr = createTestVocab({{"ADD", 1.0f}}, 64);
+ auto TestVocabOrErr = createTestVocab({{"ADD", 1.0f}}, {}, {}, {}, 64);
ASSERT_TRUE(static_cast<bool>(TestVocabOrErr))
<< "Failed to create vocabulary: "
<< toString(TestVocabOrErr.takeError());
@@ -210,7 +299,8 @@ TEST_F(MIR2VecVocabTestFixture, DeterministicMapping) {
// Test MIRVocabulary construction
TEST_F(MIR2VecVocabTestFixture, VocabularyConstruction) {
- auto VocabOrErr = createTestVocab({{"ADD", 1.0f}, {"SUB", 2.0f}}, 128);
+ auto VocabOrErr =
+ createTestVocab({{"ADD", 1.0f}, {"SUB", 2.0f}}, {}, {}, {}, 128);
ASSERT_TRUE(static_cast<bool>(VocabOrErr))
<< "Failed to create vocabulary: " << toString(VocabOrErr.takeError());
auto &Vocab = *VocabOrErr;
@@ -231,42 +321,15 @@ TEST_F(MIR2VecVocabTestFixture, VocabularyConstruction) {
EXPECT_GT(Count, 0u);
}
-// Test factory method with empty vocabulary
-TEST_F(MIR2VecVocabTestFixture, EmptyVocabularyCreation) {
- VocabMap EmptyVMap;
-
- auto VocabOrErr = MIRVocabulary::create(std::move(EmptyVMap), *TII);
- EXPECT_FALSE(static_cast<bool>(VocabOrErr))
- << "Factory method should fail with empty vocabulary";
-
- // Consume the error
- if (!VocabOrErr) {
- auto Err = VocabOrErr.takeError();
- std::string ErrorMsg = toString(std::move(Err));
- EXPECT_FALSE(ErrorMsg.empty());
- }
-}
-
// Fixture for embedding related tests
class MIR2VecEmbeddingTestFixture : public MIR2VecVocabTestFixture {
protected:
- std::unique_ptr<MachineModuleInfo> MMI;
- MachineFunction *MF = nullptr;
-
void SetUp() override {
MIR2VecVocabTestFixture::SetUp();
// If base class setup was skipped (TII not initialized), skip derived setup
if (!TII)
GTEST_SKIP() << "Failed to get target instruction info in "
"the base class setup; Skipping test";
-
- // Create a dummy function for MachineFunction
- FunctionType *FT = FunctionType::get(Type::getVoidTy(*Ctx), false);
- Function *F =
- Function::Create(FT, Function::ExternalLinkage, "test", M.get());
-
- MMI = std::make_unique<MachineModuleInfo>(TM.get());
- MF = &MMI->getOrCreateMachineFunction(*F);
}
void TearDown() override { MIR2VecVocabTestFixture::TearDown(); }
@@ -298,7 +361,8 @@ protected:
// Test factory method for creating embedder
TEST_F(MIR2VecEmbeddingTestFixture, CreateSymbolicEmbedder) {
- auto VocabOrErr = MIRVocabulary::createDummyVocabForTest(*TII, 1);
+ auto VocabOrErr =
+ MIRVocabulary::createDummyVocabForTest(*TII, *TRI, MF->getRegInfo(), 1);
ASSERT_TRUE(static_cast<bool>(VocabOrErr))
<< "Failed to create vocabulary: " << toString(VocabOrErr.takeError());
auto &V = *VocabOrErr;
@@ -307,7 +371,8 @@ TEST_F(MIR2VecEmbeddingTestFixture, CreateSymbolicEmbedder) {
}
TEST_F(MIR2VecEmbeddingTestFixture, CreateInvalidMode) {
- auto VocabOrErr = MIRVocabulary::createDummyVocabForTest(*TII, 1);
+ auto VocabOrErr =
+ MIRVocabulary::createDummyVocabForTest(*TII, *TRI, MF->getRegInfo(), 1);
ASSERT_TRUE(static_cast<bool>(VocabOrErr))
<< "Failed to create vocabulary: " << toString(VocabOrErr.takeError());
auto &V = *VocabOrErr;
@@ -324,7 +389,7 @@ TEST_F(MIR2VecEmbeddingTestFixture, TestSymbolicEmbedder) {
{"RET", 2.0f}, // [2.0, 2.0, 2.0, 2.0]
{"TRAP", 3.0f} // [3.0, 3.0, 3.0, 3.0]
},
- 4);
+ {}, {}, {}, 4);
ASSERT_TRUE(static_cast<bool>(VocabOrErr))
<< "Failed to create vocabulary: " << toString(VocabOrErr.takeError());
auto &Vocab = *VocabOrErr;
@@ -378,7 +443,8 @@ TEST_F(MIR2VecEmbeddingTestFixture, TestSymbolicEmbedder) {
// Test embedder with multiple basic blocks
TEST_F(MIR2VecEmbeddingTestFixture, MultipleBasicBlocks) {
// Create a test vocabulary
- auto VocabOrErr = createTestVocab({{"NOOP", 1.0f}, {"TRAP", 2.0f}});
+ auto VocabOrErr =
+ createTestVocab({{"NOOP", 1.0f}, {"TRAP", 2.0f}}, {}, {}, {});
ASSERT_TRUE(static_cast<bool>(VocabOrErr))
<< "Failed to create vocabulary: " << toString(VocabOrErr.takeError());
auto &Vocab = *VocabOrErr;
@@ -431,7 +497,8 @@ TEST_F(MIR2VecEmbeddingTestFixture, EmptyBasicBlock) {
MF->push_back(MBB);
// Create embedder
- auto VocabOrErr = MIRVocabulary::createDummyVocabForTest(*TII, 2);
+ auto VocabOrErr =
+ MIRVocabulary::createDummyVocabForTest(*TII, *TRI, MF->getRegInfo(), 2);
ASSERT_TRUE(static_cast<bool>(VocabOrErr))
<< "Failed to create vocabulary: " << toString(VocabOrErr.takeError());
auto &V = *VocabOrErr;
@@ -452,7 +519,7 @@ TEST_F(MIR2VecEmbeddingTestFixture, EmptyBasicBlock) {
TEST_F(MIR2VecEmbeddingTestFixture, UnknownOpcodes) {
// Create a test vocabulary with limited entries
// SUB is intentionally not included
- auto VocabOrErr = createTestVocab({{"ADD", 1.0f}});
+ auto VocabOrErr = createTestVocab({{"ADD", 1.0f}}, {}, {}, {});
ASSERT_TRUE(static_cast<bool>(VocabOrErr))
<< "Failed to create vocabulary: " << toString(VocabOrErr.takeError());
auto &Vocab = *VocabOrErr;
@@ -494,4 +561,210 @@ TEST_F(MIR2VecEmbeddingTestFixture, UnknownOpcodes) {
Embedding ExpectedBBVector(2, 1.0f * ExpectedWeight);
EXPECT_TRUE(MBBVector.approximatelyEquals(ExpectedBBVector));
}
+
+// Test vocabulary string key generation
+TEST_F(MIR2VecEmbeddingTestFixture, VocabularyStringKeys) {
+ auto VocabOrErr =
+ createTestVocab({{"ADD", 1.0f}, {"SUB", 2.0f}}, {}, {}, {}, 2);
+ ASSERT_TRUE(static_cast<bool>(VocabOrErr))
+ << "Failed to create vocabulary: " << toString(VocabOrErr.takeError());
+ auto &Vocab = *VocabOrErr;
+
+ // Test that we can get string keys for all positions
+ for (size_t Pos = 0; Pos < Vocab.getCanonicalSize(); ++Pos) {
+ std::string Key = Vocab.getStringKey(Pos);
+ EXPECT_FALSE(Key.empty()) << "Empty key at position " << Pos;
+ }
+
+ // Test specific known positions if we can identify them
+ unsigned AddIndex = Vocab.getCanonicalIndexForBaseName("ADD");
+ std::string AddKey = Vocab.getStringKey(AddIndex);
+ EXPECT_EQ(AddKey, "ADD");
+
+ unsigned SubIndex = Vocab.getCanonicalIndexForBaseName("SUB");
+ std::string SubKey = Vocab.getStringKey(SubIndex);
+ EXPECT_EQ(SubKey, "SUB");
+
+ unsigned ImmIndex = Vocab.getCanonicalIndexForOperandName("Immediate");
+ std::string ImmKey = Vocab.getStringKey(ImmIndex);
+ EXPECT_EQ(ImmKey, "Immediate");
+
+ unsigned PhyRegIndex = Vocab.getCanonicalIndexForRegisterClass("GR32", true);
+ std::string PhyRegKey = Vocab.getStringKey(PhyRegIndex);
+ EXPECT_EQ(PhyRegKey, "PhyReg_GR32");
+
+ unsigned VirtRegIndex =
+ Vocab.getCanonicalIndexForRegisterClass("GR32", false);
+ std::string VirtRegKey = Vocab.getStringKey(VirtRegIndex);
+ EXPECT_EQ(VirtRegKey, "VirtReg_GR32");
+}
+
+// Test vocabulary dimension consistency
+TEST_F(MIR2VecEmbeddingTestFixture, DimensionConsistency) {
+ auto VocabOrErr = createTestVocab({{"TEST", 1.0f}}, {}, {}, {}, 5);
+ ASSERT_TRUE(static_cast<bool>(VocabOrErr))
+ << "Failed to create vocabulary: " << toString(VocabOrErr.takeError());
+ auto &Vocab = *VocabOrErr;
+
+ EXPECT_EQ(Vocab.getDimension(), 5u);
+
+ // All embeddings should have the same dimension
+ for (auto IT = Vocab.begin(); IT != Vocab.end(); ++IT)
+ EXPECT_EQ((*IT).size(), 5u);
+}
+
+// Test invalid register handling through machine instruction creation
+TEST_F(MIR2VecEmbeddingTestFixture, InvalidRegisterHandling) {
+ float MOVValue = 1.5f;
+ float ImmValue = 0.5f;
+ float PhyRegValue = 0.2f;
+ auto VocabOrErr = createTestVocab(
+ {{"MOV", MOVValue}}, {{"Immediate", ImmValue}},
+ {{"GR8_ABCD_H", PhyRegValue}, {"GR8_ABCD_L", PhyRegValue + 0.1f}}, {}, 3);
+ ASSERT_TRUE(static_cast<bool>(VocabOrErr))
+ << "Failed to create vocabulary: " << toString(VocabOrErr.takeError());
+ auto &Vocab = *VocabOrErr;
+
+ MachineBasicBlock *MBB = MF->CreateMachineBasicBlock();
+ MF->push_back(MBB);
+
+ // Create a MOV instruction with actual operands including potential $noreg
+ // This tests the actual scenario where invalid registers are encountered
+ auto MovOpcode = findOpcodeByName("MOV32mr");
+ ASSERT_NE(MovOpcode, -1) << "MOV32mr opcode not found";
+ const MCInstrDesc &Desc = TII->get(MovOpcode);
+
+ // Use available physical registers from the target
+ unsigned BaseReg =
+ TRI->getNumRegs() > 1 ? 1 : 0; // First available physical register
+ unsigned ValueReg = TRI->getNumRegs() > 2 ? 2 : BaseReg;
+
+ // MOV32mr typically has: base, scale, index, displacement, segment, value
+ // Use the MachineInstrBuilder API properly
+ auto MovInst = BuildMI(*MBB, MBB->end(), DebugLoc(), Desc)
+ .addReg(BaseReg) // base
+ .addImm(1) // scale
+ .addReg(0) // index ($noreg)
+ .addImm(-4) // displacement
+ .addReg(0) // segment ($noreg)
+ .addReg(ValueReg); // value
+
+ auto Embedder = SymbolicMIREmbedder::create(*MF, Vocab);
+ ASSERT_TRUE(Embedder != nullptr);
+
+ // This should not crash even if the instruction has $noreg operands
+ auto InstEmb = Embedder->getMInstVector(*MovInst);
+ EXPECT_EQ(InstEmb.size(), 3u);
+
+ // Test the expected embedding value
+ Embedding ExpectedOpcodeContribution(3, MOVValue * mir2vec::OpcWeight);
+ auto ExpectedOperandContribution =
+ Embedding(3, PhyRegValue * mir2vec::RegOperandWeight) // Base
+ + Embedding(3, ImmValue * mir2vec::CommonOperandWeight) // Scale
+ + Embedding(3, 0.0f) // noreg
+ + Embedding(3, ImmValue * mir2vec::CommonOperandWeight) // displacement
+ + Embedding(3, 0.0f) // noreg
+ + Embedding(3, (PhyRegValue + 0.1f) * mir2vec::RegOperandWeight); // Value
+ auto ExpectedEmb = ExpectedOpcodeContribution + ExpectedOperandContribution;
+ EXPECT_TRUE(InstEmb.approximatelyEquals(ExpectedEmb))
+ << "MOV instruction embedding should match expected embedding";
+}
+
+// Test handling of both physical and virtual registers in an instruction
+TEST_F(MIR2VecEmbeddingTestFixture, PhysicalAndVirtualRegisterHandling) {
+ float MOVValue = 2.0f;
+ float ImmValue = 0.7f;
+ float PhyRegValue = 0.3f;
+ float VirtRegValue = 0.9f;
+
+ // Find GR32 register class
+ const TargetRegisterClass *GR32RC = nullptr;
+ for (unsigned i = 0; i < TRI->getNumRegClasses(); ++i) {
+ const TargetRegisterClass *RC = TRI->getRegClass(i);
+ if (std::string(TRI->getRegClassName(RC)) == "GR32") {
+ GR32RC = RC;
+ break;
+ }
+ }
+ ASSERT_TRUE(GR32RC != nullptr && GR32RC->isAllocatable())
+ << "No allocatable GR32 register class found";
+
+ // Get first available physical register from GR32
+ unsigned PhyReg = *GR32RC->begin();
+ // Create a virtual register of class GR32
+ unsigned VirtReg = MF->getRegInfo().createVirtualRegister(GR32RC);
+
+ // Create vocabulary with register class based keys
+ auto VocabOrErr =
+ createTestVocab({{"MOV", MOVValue}}, {{"Immediate", ImmValue}},
+ {{"GR32_AD", PhyRegValue}}, // GR32_AD is the minimal key
+ {{"GR32", VirtRegValue}}, 4);
+ ASSERT_TRUE(static_cast<bool>(VocabOrErr))
+ << "Failed to create vocabulary: " << toString(VocabOrErr.takeError());
+ auto &Vocab = *VocabOrErr;
+
+ MachineBasicBlock *MBB = MF->CreateMachineBasicBlock();
+ MF->push_back(MBB);
+
+ // Create a MOV32rr instruction: MOV32rr dst, src
+ auto MovOpcode = findOpcodeByName("MOV32rr");
+ ASSERT_NE(MovOpcode, -1) << "MOV32rr opcode not found";
+ const MCInstrDesc &Desc = TII->get(MovOpcode);
+
+ // MOV32rr: dst (physical), src (virtual)
+ auto MovInst = BuildMI(*MBB, MBB->end(), DebugLoc(), Desc)
+ .addReg(PhyReg) // physical register destination
+ .addReg(VirtReg); // virtual register source
+
+ // Create embedder with virtual register support
+ auto Embedder = SymbolicMIREmbedder::create(*MF, Vocab);
+ ASSERT_TRUE(Embedder != nullptr);
+
+ // This should not crash and should produce a valid embedding
+ auto InstEmb = Embedder->getMInstVector(*MovInst);
+ EXPECT_EQ(InstEmb.size(), 4u);
+
+ // Test the expected embedding value
+ Embedding ExpectedOpcodeContribution(4, MOVValue * mir2vec::OpcWeight);
+ auto ExpectedOperandContribution =
+ Embedding(4, PhyRegValue * mir2vec::RegOperandWeight) // dst (physical)
+ + Embedding(4, VirtRegValue * mir2vec::RegOperandWeight); // src (virtual)
+ auto ExpectedEmb = ExpectedOpcodeContribution + ExpectedOperandContribution;
+ EXPECT_TRUE(InstEmb.approximatelyEquals(ExpectedEmb))
+ << "MOV32rr instruction embedding should match expected embedding";
+}
+
+// Test precise embedding calculation with known operands
+TEST_F(MIR2VecEmbeddingTestFixture, EmbeddingCalculation) {
+ auto VocabOrErr = createTestVocab({{"NOOP", 2.0f}}, {}, {}, {}, 2);
+ ASSERT_TRUE(static_cast<bool>(VocabOrErr))
+ << "Failed to create vocabulary: " << toString(VocabOrErr.takeError());
+ auto &Vocab = *VocabOrErr;
+
+ MachineBasicBlock *MBB = MF->CreateMachineBasicBlock();
+ MF->push_back(MBB);
+
+ // Create a simple NOOP instruction (no operands)
+ auto NoopInst = createMachineInstr(*MBB, "NOOP");
+ ASSERT_TRUE(NoopInst != nullptr);
+
+ auto Embedder = SymbolicMIREmbedder::create(*MF, Vocab);
+ ASSERT_TRUE(Embedder != nullptr);
+
+ // Get the instruction embedding
+ auto InstEmb = Embedder->getMInstVector(*NoopInst);
+ EXPECT_EQ(InstEmb.size(), 2u);
+
+ // For NOOP with no operands, the embedding should be exactly the opcode
+ // embedding
+ float ExpectedWeight = mir2vec::OpcWeight;
+ Embedding ExpectedEmb(2, 2.0f * ExpectedWeight);
+
+ EXPECT_TRUE(InstEmb.approximatelyEquals(ExpectedEmb))
+ << "NOOP instruction embedding should match opcode embedding";
+
+ // Verify individual components
+ EXPECT_FLOAT_EQ(InstEmb[0], 2.0f * ExpectedWeight);
+ EXPECT_FLOAT_EQ(InstEmb[1], 2.0f * ExpectedWeight);
+}
} // namespace
diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
index 16b9979..aa56aaf 100644
--- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
@@ -550,6 +550,31 @@ TEST_F(SelectionDAGPatternMatchTest, matchNode) {
EXPECT_FALSE(sd_match(Add, m_Node(ISD::ADD, m_ConstInt(), m_Value())));
}
+TEST_F(SelectionDAGPatternMatchTest, matchSelectLike) {
+ SDLoc DL;
+ auto Int32VT = EVT::getIntegerVT(Context, 32);
+ auto VInt32VT = EVT::getVectorVT(Context, Int32VT, 4);
+
+ SDValue Cond = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 0, Int32VT);
+ SDValue TVal = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, Int32VT);
+ SDValue FVal = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 2, Int32VT);
+
+ SDValue VCond = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 0, VInt32VT);
+ SDValue VTVal = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 1, VInt32VT);
+ SDValue VFVal = DAG->getCopyFromReg(DAG->getEntryNode(), DL, 2, VInt32VT);
+
+ SDValue Select = DAG->getNode(ISD::SELECT, DL, Int32VT, Cond, TVal, FVal);
+ SDValue VSelect =
+ DAG->getNode(ISD::VSELECT, DL, Int32VT, VCond, VTVal, VFVal);
+
+ using namespace SDPatternMatch;
+ EXPECT_TRUE(sd_match(Select, m_SelectLike(m_Specific(Cond), m_Specific(TVal),
+ m_Specific(FVal))));
+ EXPECT_TRUE(
+ sd_match(VSelect, m_SelectLike(m_Specific(VCond), m_Specific(VTVal),
+ m_Specific(VFVal))));
+}
+
namespace {
struct VPMatchContext : public SDPatternMatch::BasicMatchContext {
using SDPatternMatch::BasicMatchContext::BasicMatchContext;
diff --git a/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp b/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp
index 3c9374b..4235c93 100644
--- a/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp
+++ b/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp
@@ -716,3 +716,32 @@ attributes #0 = { presplitcoroutine }
EXPECT_FALSE(llvm::isPresplitCoroSuspendExitEdge(
*ExitN.getSinglePredecessor(), ExitN));
}
+
+TEST(BasicBlockUtils, BasicBlockPrintable) {
+ std::string S;
+ std::string SCheck;
+ llvm::raw_string_ostream OS{S};
+ llvm::raw_string_ostream OSCheck{SCheck};
+
+ LLVMContext C;
+ std::unique_ptr<Module> M = parseIR(C, R"IR(
+define void @foo() {
+ br label %bb0
+bb0:
+ br label %.exit
+.exit:
+ ret void
+}
+)IR");
+
+ Function *F = M->getFunction("foo");
+ for (const BasicBlock &BB : *F) {
+ OS << printBasicBlock(&BB);
+ BB.printAsOperand(OSCheck);
+ EXPECT_EQ(OS.str(), OSCheck.str());
+ S.clear();
+ SCheck.clear();
+ }
+ OS << printBasicBlock(nullptr);
+ EXPECT_EQ(OS.str(), "<nullptr>");
+}
diff --git a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn
index 38ba466..df9ddf9 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn
@@ -45,6 +45,7 @@ static_library("Support") {
"ARMAttributeParser.cpp",
"ARMBuildAttributes.cpp",
"ARMWinEH.cpp",
+ "AllocToken.cpp",
"Allocator.cpp",
"AutoConvert.cpp",
"BalancedPartitioning.cpp",
diff --git a/llvm/utils/update_mc_test_checks.py b/llvm/utils/update_mc_test_checks.py
index ab7fe19..67fff56 100755
--- a/llvm/utils/update_mc_test_checks.py
+++ b/llvm/utils/update_mc_test_checks.py
@@ -290,11 +290,9 @@ def update_test(ti: common.TestInfo):
# prefix is selected and generated with most shared output lines
# each run_id can only be used once
- gen_prefix = ""
used_runid = set()
- # line number diff between generated prefix and testline
- line_offset = 1
+ selected_prefixes = set()
for prefix, tup in p_dict_sorted.items():
o, run_ids = tup
@@ -308,18 +306,24 @@ def update_test(ti: common.TestInfo):
else:
used_runid.add(i)
if not skip:
- used_prefixes.add(prefix)
+ selected_prefixes.add(prefix)
- if hasErr(o):
- newline = getErrCheckLine(prefix, o, mc_mode, line_offset)
- else:
- newline = getStdCheckLine(prefix, o, mc_mode)
+ # Generate check lines in alphabetical order.
+ check_lines = []
+ for prefix in sorted(selected_prefixes):
+ o, run_ids = p_dict[prefix]
+ used_prefixes.add(prefix)
+
+ if hasErr(o):
+ line_offset = len(check_lines) + 1
+ check = getErrCheckLine(prefix, o, mc_mode, line_offset)
+ else:
+ check = getStdCheckLine(prefix, o, mc_mode)
- if newline:
- gen_prefix += newline
- line_offset += 1
+ if check:
+ check_lines.append(check.strip())
- generated_prefixes[input_line] = gen_prefix.rstrip("\n")
+ generated_prefixes[input_line] = "\n".join(check_lines)
# write output
for input_info in ti.iterlines(output_lines):