author     Xiaofeng Tian <110771974+txff99@users.noreply.github.com>  2024-09-11 14:00:54 -0700
committer  Vitaly Buka <vitalybuka@google.com>  2024-09-11 14:00:54 -0700
commit     8ee4ba92e1a85fd220caf2f83d5d86c334d432db (patch)
tree       847bf2e4f29d84e1369a9d4ef2d1e69c637a36de
parent     01006d0bc93e5c6a6953ea8874eadb56ec719399 (diff)
parent     c2b93e0671d8cfd6b1a24c6e1d7be290125b8974 (diff)
download   llvm-users/vitalybuka/spr/main.nfcsanitizer-commit-test-for-106912.zip
           llvm-users/vitalybuka/spr/main.nfcsanitizer-commit-test-for-106912.tar.gz
           llvm-users/vitalybuka/spr/main.nfcsanitizer-commit-test-for-106912.tar.bz2
[𝘀𝗽𝗿] changes introduced through rebase (branch: users/vitalybuka/spr/main.nfcsanitizer-commit-test-for-106912)
Created using spr 1.3.4 [skip ci]
-rw-r--r-- clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp | 6
-rw-r--r-- clang/lib/CodeGen/Targets/AArch64.cpp | 6
-rw-r--r-- clang/lib/Sema/HLSLExternalSemaSource.cpp | 10
-rw-r--r-- clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp | 19
-rw-r--r-- clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h | 5
-rw-r--r-- clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp | 3
-rw-r--r-- clang/lib/Tooling/Transformer/Stencil.cpp | 8
-rw-r--r-- clang/test/AST/HLSL/StructuredBuffer-AST.hlsl | 64
-rw-r--r-- clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp | 11
-rw-r--r-- clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.mm | 26
-rw-r--r-- clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl | 22
-rw-r--r-- clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl | 12
-rw-r--r-- clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl | 52
-rw-r--r-- clang/test/CodeGenHLSL/builtins/StructuredBuffer-subscript.hlsl | 16
-rw-r--r-- clang/test/CodeGenHLSL/loops/unroll.hlsl | 46
-rw-r--r-- clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl | 19
-rw-r--r-- clang/unittests/Tooling/StencilTest.cpp | 22
-rw-r--r-- clang/utils/TableGen/ASTTableGen.cpp | 19
-rw-r--r-- clang/utils/TableGen/ASTTableGen.h | 37
-rw-r--r-- clang/utils/TableGen/ClangBuiltinsEmitter.cpp | 2
-rw-r--r-- clang/utils/TableGen/ClangCommentCommandInfoEmitter.cpp | 14
-rw-r--r-- clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp | 16
-rw-r--r-- clang/utils/TableGen/ClangCommentHTMLTagsEmitter.cpp | 13
-rw-r--r-- clang/utils/TableGen/ClangDataCollectorsEmitter.cpp | 2
-rw-r--r-- clang/utils/TableGen/ClangOpcodesEmitter.cpp | 6
-rw-r--r-- clang/utils/TableGen/ClangOpenCLBuiltinEmitter.cpp | 46
-rw-r--r-- clang/utils/TableGen/ClangOptionDocEmitter.cpp | 46
-rw-r--r-- clang/utils/TableGen/TableGenBackends.h | 27
-rw-r--r-- compiler-rt/lib/builtins/cpu_model/aarch64.c | 2
-rw-r--r-- compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp | 4
-rw-r--r-- compiler-rt/test/asan/TestCases/Linux/stress_dtls.c | 5
-rw-r--r-- flang/lib/Semantics/expression.cpp | 1
-rw-r--r-- libc/cmake/modules/LLVMLibCCompileOptionRules.cmake | 4
-rw-r--r-- libc/config/config.json | 6
-rw-r--r-- libc/docs/configure.rst | 2
-rw-r--r-- libc/hdr/CMakeLists.txt | 18
-rw-r--r-- libc/hdr/link_macros.h | 22
-rw-r--r-- libc/hdr/sys_auxv_macros.h | 22
-rw-r--r-- libc/src/__support/CMakeLists.txt | 3
-rw-r--r-- libc/src/__support/OSUtil/linux/CMakeLists.txt | 30
-rw-r--r-- libc/src/__support/OSUtil/linux/aarch64/CMakeLists.txt | 10
-rw-r--r-- libc/src/__support/OSUtil/linux/aarch64/vdso.h | 37
-rw-r--r-- libc/src/__support/OSUtil/linux/arm/CMakeLists.txt | 10
-rw-r--r-- libc/src/__support/OSUtil/linux/arm/vdso.h | 37
-rw-r--r-- libc/src/__support/OSUtil/linux/riscv/CMakeLists.txt | 10
-rw-r--r-- libc/src/__support/OSUtil/linux/riscv/vdso.h | 43
-rw-r--r-- libc/src/__support/OSUtil/linux/vdso.cpp | 237
-rw-r--r-- libc/src/__support/OSUtil/linux/vdso.h | 81
-rw-r--r-- libc/src/__support/OSUtil/linux/vdso_sym.h | 70
-rw-r--r-- libc/src/__support/OSUtil/linux/x86_64/CMakeLists.txt | 10
-rw-r--r-- libc/src/__support/OSUtil/linux/x86_64/vdso.h | 43
-rw-r--r-- libc/src/__support/macros/CMakeLists.txt | 10
-rw-r--r-- libc/src/__support/macros/null_check.h | 33
-rw-r--r-- libc/src/__support/macros/sanitizer.h | 21
-rw-r--r-- libc/src/__support/str_to_float.h | 4
-rw-r--r-- libc/src/stdlib/atexit.cpp | 6
-rw-r--r-- libc/src/stdlib/quick_exit.cpp | 3
-rw-r--r-- libc/src/sys/auxv/getauxval.h | 2
-rw-r--r-- libc/startup/gpu/amdgpu/start.cpp | 3
-rw-r--r-- libc/startup/gpu/nvptx/start.cpp | 3
-rw-r--r-- libc/startup/linux/do_start.cpp | 11
-rw-r--r-- libc/test/src/__support/OSUtil/linux/CMakeLists.txt | 18
-rw-r--r-- libc/test/src/__support/OSUtil/linux/vdso_test.cpp | 162
-rw-r--r-- libc/test/src/compiler/CMakeLists.txt | 1
-rw-r--r-- libc/test/src/compiler/stack_chk_guard_test.cpp | 6
-rw-r--r-- libc/test/src/math/smoke/CMakeLists.txt | 14
-rw-r--r-- libc/test/src/math/smoke/nan_test.cpp | 7
-rw-r--r-- libc/test/src/math/smoke/nanf128_test.cpp | 7
-rw-r--r-- libc/test/src/math/smoke/nanf16_test.cpp | 7
-rw-r--r-- libc/test/src/math/smoke/nanf_test.cpp | 7
-rw-r--r-- libc/test/src/math/smoke/nanl_test.cpp | 7
-rw-r--r-- libcxx/include/CMakeLists.txt | 1
-rw-r--r-- libcxx/include/__config | 9
-rw-r--r-- libcxx/include/__pstl/backend.h | 24
-rw-r--r-- libcxx/include/__pstl/backend_fwd.h | 16
-rw-r--r-- libcxx/include/__pstl/backends/default.h | 4
-rw-r--r-- libcxx/include/__pstl/backends/libdispatch.h | 12
-rw-r--r-- libcxx/include/__pstl/backends/serial.h | 4
-rw-r--r-- libcxx/include/__pstl/backends/std_thread.h | 4
-rw-r--r-- libcxx/include/__pstl/cpu_algos/any_of.h | 4
-rw-r--r-- libcxx/include/__pstl/cpu_algos/cpu_traits.h | 4
-rw-r--r-- libcxx/include/__pstl/cpu_algos/fill.h | 4
-rw-r--r-- libcxx/include/__pstl/cpu_algos/find_if.h | 4
-rw-r--r-- libcxx/include/__pstl/cpu_algos/for_each.h | 4
-rw-r--r-- libcxx/include/__pstl/cpu_algos/merge.h | 4
-rw-r--r-- libcxx/include/__pstl/cpu_algos/stable_sort.h | 4
-rw-r--r-- libcxx/include/__pstl/cpu_algos/transform.h | 4
-rw-r--r-- libcxx/include/__pstl/cpu_algos/transform_reduce.h | 4
-rw-r--r-- libcxx/include/__pstl/dispatch.h | 4
-rw-r--r-- libcxx/include/__pstl/handle_exception.h | 4
-rw-r--r-- libcxx/include/experimental/__config | 45
-rw-r--r-- libcxx/include/experimental/__simd/aligned_tag.h | 2
-rw-r--r-- libcxx/include/experimental/__simd/declaration.h | 9
-rw-r--r-- libcxx/include/experimental/__simd/reference.h | 2
-rw-r--r-- libcxx/include/experimental/__simd/scalar.h | 2
-rw-r--r-- libcxx/include/experimental/__simd/simd.h | 2
-rw-r--r-- libcxx/include/experimental/__simd/simd_mask.h | 2
-rw-r--r-- libcxx/include/experimental/__simd/traits.h | 2
-rw-r--r-- libcxx/include/experimental/__simd/utility.h | 2
-rw-r--r-- libcxx/include/experimental/__simd/vec_ext.h | 2
-rw-r--r-- libcxx/include/experimental/iterator | 2
-rw-r--r-- libcxx/include/experimental/memory | 2
-rw-r--r-- libcxx/include/experimental/propagate_const | 2
-rw-r--r-- libcxx/include/experimental/simd | 2
-rw-r--r-- libcxx/include/experimental/type_traits | 2
-rw-r--r-- libcxx/include/experimental/utility | 2
-rw-r--r-- libcxx/include/module.modulemap | 4
-rw-r--r-- libcxx/src/any.cpp | 2
-rw-r--r-- libcxx/src/optional.cpp | 2
-rw-r--r-- lld/test/wasm/static-error.s | 12
-rw-r--r-- lld/wasm/Driver.cpp | 10
-rw-r--r-- lldb/include/lldb/API/SBMemoryRegionInfo.h | 2
-rw-r--r-- lldb/include/lldb/API/SBSaveCoreOptions.h | 11
-rw-r--r-- lldb/include/lldb/Core/SourceManager.h | 7
-rw-r--r-- lldb/include/lldb/Symbol/SaveCoreOptions.h | 11
-rw-r--r-- lldb/include/lldb/Target/CoreFileMemoryRanges.h | 50
-rw-r--r-- lldb/include/lldb/Target/Process.h | 25
-rw-r--r-- lldb/include/lldb/Utility/RangeMap.h | 6
-rw-r--r-- lldb/include/lldb/lldb-enumerations.h | 1
-rw-r--r-- lldb/include/lldb/lldb-forward.h | 1
-rw-r--r-- lldb/include/lldb/lldb-private-interfaces.h | 1
-rw-r--r-- lldb/source/API/SBSaveCoreOptions.cpp | 11
-rw-r--r-- lldb/source/Commands/CommandObjectProcess.cpp | 1
-rw-r--r-- lldb/source/Core/SourceManager.cpp | 24
-rw-r--r-- lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp | 6
-rw-r--r-- lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h | 1
-rw-r--r-- lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp | 35
-rw-r--r-- lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h | 5
-rw-r--r-- lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.h | 1
-rw-r--r-- lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp | 1
-rw-r--r-- lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h | 1
-rw-r--r-- lldb/source/Symbol/SaveCoreOptions.cpp | 14
-rw-r--r-- lldb/source/Target/CMakeLists.txt | 1
-rw-r--r-- lldb/source/Target/CoreFileMemoryRanges.cpp | 86
-rw-r--r-- lldb/source/Target/Process.cpp | 76
-rw-r--r-- lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py | 149
-rw-r--r-- lldb/test/Shell/SymbolFile/Inputs/main.c | 4
-rw-r--r-- lldb/test/Shell/SymbolFile/checksum-mismatch.test | 7
-rw-r--r-- lldb/unittests/Process/Utility/CMakeLists.txt | 1
-rw-r--r-- lldb/unittests/Process/Utility/CoreFileMemoryRangesTest.cpp | 205
-rw-r--r-- llvm/benchmarks/SandboxIRBench.cpp | 72
-rw-r--r-- llvm/include/llvm/ADT/DenseMap.h | 165
-rw-r--r-- llvm/include/llvm/ADT/DenseSet.h | 55
-rw-r--r-- llvm/include/llvm/Analysis/MemorySSA.h | 28
-rw-r--r-- llvm/include/llvm/IR/Constant.h | 4
-rw-r--r-- llvm/include/llvm/IR/Constants.h | 35
-rw-r--r-- llvm/include/llvm/IR/DerivedUser.h | 4
-rw-r--r-- llvm/include/llvm/IR/Function.h | 7
-rw-r--r-- llvm/include/llvm/IR/GlobalAlias.h | 4
-rw-r--r-- llvm/include/llvm/IR/GlobalIFunc.h | 4
-rw-r--r-- llvm/include/llvm/IR/GlobalObject.h | 7
-rw-r--r-- llvm/include/llvm/IR/GlobalValue.h | 6
-rw-r--r-- llvm/include/llvm/IR/GlobalVariable.h | 31
-rw-r--r-- llvm/include/llvm/IR/InstrTypes.h | 22
-rw-r--r-- llvm/include/llvm/IR/Instruction.h | 2
-rw-r--r-- llvm/include/llvm/IR/Instructions.h | 262
-rw-r--r-- llvm/include/llvm/IR/User.h | 105
-rw-r--r-- llvm/include/llvm/SandboxIR/SandboxIR.h | 15
-rw-r--r-- llvm/include/llvm/Support/Casting.h | 2
-rw-r--r-- llvm/lib/IR/Constants.cpp | 36
-rw-r--r-- llvm/lib/IR/ConstantsContext.h | 44
-rw-r--r-- llvm/lib/IR/Function.cpp | 5
-rw-r--r-- llvm/lib/IR/Globals.cpp | 11
-rw-r--r-- llvm/lib/IR/Instruction.cpp | 4
-rw-r--r-- llvm/lib/IR/Instructions.cpp | 235
-rw-r--r-- llvm/lib/IR/User.cpp | 20
-rw-r--r-- llvm/lib/Passes/PassBuilderPipelines.cpp | 6
-rw-r--r-- llvm/lib/SandboxIR/SandboxIR.cpp | 6
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 38
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 14
-rw-r--r-- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 16
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 163
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstructions.td | 21
-rw-r--r-- llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp | 3
-rw-r--r-- llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 57
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 232
-rw-r--r-- llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 1
-rw-r--r-- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 10
-rw-r--r-- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 2
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 19
-rw-r--r-- llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td | 4
-rw-r--r-- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 17
-rw-r--r-- llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp | 14
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll | 50
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll | 70
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll | 64
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll | 8
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll | 12
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll | 16
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll | 50
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll | 24
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll | 70
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll | 80
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll | 50
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll | 70
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll | 80
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll | 4
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll | 50
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll | 70
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll | 64
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll | 8
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll | 12
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll | 16
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll | 50
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll | 24
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll | 70
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll | 80
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll | 50
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll | 70
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll | 80
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll | 32
-rw-r--r-- llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll | 32
-rw-r--r-- llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll | 8
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll | 12
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll | 9
-rw-r--r-- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll | 243
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll | 758
-rw-r--r-- llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll | 3808
-rw-r--r-- llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll | 13
-rw-r--r-- llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll | 37
-rw-r--r-- llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll | 550
-rw-r--r-- llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll | 357
-rw-r--r-- llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll | 357
-rw-r--r-- llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll | 550
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll | 55
-rw-r--r-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll | 253
-rw-r--r-- llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wave32.ll | 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wqm.mir | 29
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 648
-rw-r--r-- llvm/test/CodeGen/AMDGPU/wwm-reserved.ll | 660
-rw-r--r-- llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll | 556
-rw-r--r-- llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 573
-rw-r--r-- llvm/test/CodeGen/RISCV/machineoutliner-pcrel-lo.mir | 104
-rw-r--r-- llvm/test/CodeGen/WebAssembly/half-precision.ll | 29
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll | 4
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll | 4
-rw-r--r-- llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll | 696
-rw-r--r-- llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll | 93
-rw-r--r-- llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-03.ll | 3
-rw-r--r-- llvm/test/Transforms/LoopVectorize/use-iv-start-value.ll | 74
-rw-r--r-- llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll | 1
-rw-r--r-- llvm/tools/llvm-dis/llvm-dis.cpp | 18
-rw-r--r-- llvm/unittests/SandboxIR/SandboxIRTest.cpp | 11
-rw-r--r-- llvm/unittests/SandboxIR/TrackerTest.cpp | 26
-rw-r--r-- llvm/utils/TableGen/AsmMatcherEmitter.cpp | 20
-rw-r--r-- llvm/utils/TableGen/CodeEmitterGen.cpp | 4
-rw-r--r-- llvm/utils/TableGen/CodeGenMapTable.cpp | 4
-rw-r--r-- llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp | 49
-rw-r--r-- llvm/utils/TableGen/Common/CodeGenDAGPatterns.h | 61
-rw-r--r-- llvm/utils/TableGen/Common/CodeGenInstruction.cpp | 10
-rw-r--r-- llvm/utils/TableGen/Common/CodeGenInstruction.h | 8
-rw-r--r-- llvm/utils/TableGen/Common/CodeGenRegisters.cpp | 41
-rw-r--r-- llvm/utils/TableGen/Common/CodeGenRegisters.h | 28
-rw-r--r-- llvm/utils/TableGen/Common/CodeGenSchedule.cpp | 2
-rw-r--r-- llvm/utils/TableGen/Common/DAGISelMatcher.h | 6
-rw-r--r-- llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp | 2
-rw-r--r-- llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h | 5
-rw-r--r-- llvm/utils/TableGen/Common/InfoByHwMode.cpp | 8
-rw-r--r-- llvm/utils/TableGen/Common/InfoByHwMode.h | 7
-rw-r--r-- llvm/utils/TableGen/Common/SubtargetFeatureInfo.cpp | 10
-rw-r--r-- llvm/utils/TableGen/Common/SubtargetFeatureInfo.h | 12
-rw-r--r-- llvm/utils/TableGen/Common/VarLenCodeEmitterGen.cpp | 8
-rw-r--r-- llvm/utils/TableGen/DAGISelEmitter.cpp | 4
-rw-r--r-- llvm/utils/TableGen/DAGISelMatcherEmitter.cpp | 10
-rw-r--r-- llvm/utils/TableGen/DAGISelMatcherGen.cpp | 8
-rw-r--r-- llvm/utils/TableGen/FastISelEmitter.cpp | 8
-rw-r--r-- llvm/utils/TableGen/GlobalISelEmitter.cpp | 20
-rw-r--r-- llvm/utils/TableGen/InstrDocsEmitter.cpp | 2
-rw-r--r-- llvm/utils/TableGen/InstrInfoEmitter.cpp | 21
-rw-r--r-- llvm/utils/TableGen/X86FoldTablesEmitter.cpp | 6
-rw-r--r-- llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn | 1
-rw-r--r-- mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h | 2
-rw-r--r-- mlir/include/mlir/Dialect/Tosa/IR/CMakeLists.txt | 4
-rw-r--r-- mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td | 58
-rw-r--r-- mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td | 61
-rw-r--r-- mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h | 13
-rw-r--r-- mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 7
-rw-r--r-- mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp | 21
-rw-r--r-- mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp | 64
-rw-r--r-- mlir/lib/Dialect/Tosa/IR/TosaOps.cpp | 135
-rw-r--r-- mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir | 10
-rw-r--r-- mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-pipeline.mlir | 2
-rw-r--r-- mlir/test/Dialect/Tosa/constant-op-fold.mlir | 4
-rw-r--r-- mlir/test/Dialect/Tosa/invalid.mlir | 74
-rw-r--r-- mlir/test/Dialect/Tosa/ops.mlir | 16
-rw-r--r-- utils/bazel/llvm-project-overlay/libc/BUILD.bazel | 11
288 files changed, 10031 insertions, 7117 deletions
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp
index 8116db5..98c592d 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp
@@ -187,14 +187,14 @@ void lambda_value_reference_auxiliary_var(T&& t) {
namespace deleted_functions {
template <typename T>
-void f(T &&) = delete;
+void f(T &&t) = delete;
struct S {
template <typename T>
- S(T &&) = delete;
+ S(T &&t) = delete;
template <typename T>
- void operator&(T &&) = delete;
+ void operator&(T &&t) = delete;
};
} // namespace deleted_functions
diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
index 2f119fe..ec617ee 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -500,7 +500,7 @@ bool AArch64SwiftABIInfo::isLegalVectorType(CharUnits VectorSize,
bool AArch64ABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
// For the soft-float ABI variant, no types are considered to be homogeneous
// aggregates.
- if (Kind == AArch64ABIKind::AAPCSSoft)
+ if (isSoftFloat())
return false;
// Homogeneous aggregates for AAPCS64 must have base types of a floating
@@ -555,8 +555,8 @@ RValue AArch64ABIInfo::EmitAAPCSVAArg(Address VAListAddr, QualType Ty,
BaseTy = ArrTy->getElementType();
NumRegs = ArrTy->getNumElements();
}
- bool IsFPR = Kind != AArch64ABIKind::AAPCSSoft &&
- (BaseTy->isFloatingPointTy() || BaseTy->isVectorTy());
+ bool IsFPR =
+ !isSoftFloat() && (BaseTy->isFloatingPointTy() || BaseTy->isVectorTy());
// The AArch64 va_list type and handling is specified in the Procedure Call
// Standard, section B.4:
diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp
index 071e64f..da7bbf8 100644
--- a/clang/lib/Sema/HLSLExternalSemaSource.cpp
+++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp
@@ -525,6 +525,16 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() {
.addArraySubscriptOperators()
.completeDefinition();
});
+
+ Decl = BuiltinTypeDeclBuilder(*SemaPtr, HLSLNamespace, "StructuredBuffer")
+ .addSimpleTemplateParams(*SemaPtr, {"element_type"})
+ .Record;
+ onCompletion(Decl, [this](CXXRecordDecl *Decl) {
+ setupBufferType(Decl, *SemaPtr, ResourceClass::UAV,
+ ResourceKind::TypedBuffer, /*IsROV=*/false)
+ .addArraySubscriptOperators()
+ .completeDefinition();
+ });
}
void HLSLExternalSemaSource::onCompletion(CXXRecordDecl *Record,
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
index 49bbff1..f48b2fd 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
@@ -143,6 +143,16 @@ bool isReturnValueRefCounted(const clang::FunctionDecl *F) {
return false;
}
+std::optional<bool> isUncounted(const QualType T) {
+ if (auto *Subst = dyn_cast<SubstTemplateTypeParmType>(T)) {
+ if (auto *Decl = Subst->getAssociatedDecl()) {
+ if (isRefType(safeGetName(Decl)))
+ return false;
+ }
+ }
+ return isUncounted(T->getAsCXXRecordDecl());
+}
+
std::optional<bool> isUncounted(const CXXRecordDecl* Class)
{
// Keep isRefCounted first as it's cheaper.
@@ -231,11 +241,9 @@ bool isSingleton(const FunctionDecl *F) {
if (!MethodDecl->isStatic())
return false;
}
- const auto &Name = safeGetName(F);
- std::string SingletonStr = "singleton";
- auto index = Name.find(SingletonStr);
- return index != std::string::npos &&
- index == Name.size() - SingletonStr.size();
+ const auto &NameStr = safeGetName(F);
+ StringRef Name = NameStr; // FIXME: Make safeGetName return StringRef.
+ return Name == "singleton" || Name.ends_with("Singleton");
}
// We only care about statements so let's use the simple
@@ -397,6 +405,7 @@ public:
return true;
if (Name == "WTFCrashWithInfo" || Name == "WTFBreakpointTrap" ||
+ Name == "WTFReportBacktrace" ||
Name == "WTFCrashWithSecurityImplication" || Name == "WTFCrash" ||
Name == "WTFReportAssertionFailure" || Name == "isMainThread" ||
Name == "isMainThreadOrGCThread" || Name == "isMainRunLoop" ||
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
index ec1db1c..2932e62 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
@@ -20,6 +20,7 @@ class CXXMethodDecl;
class CXXRecordDecl;
class Decl;
class FunctionDecl;
+class QualType;
class Stmt;
class Type;
@@ -44,6 +45,10 @@ bool isRefCounted(const clang::CXXRecordDecl *Class);
/// \returns true if \p Class is ref-countable AND not ref-counted, false if
/// not, std::nullopt if inconclusive.
+std::optional<bool> isUncounted(const clang::QualType T);
+
+/// \returns true if \p Class is ref-countable AND not ref-counted, false if
+/// not, std::nullopt if inconclusive.
std::optional<bool> isUncounted(const clang::CXXRecordDecl* Class);
/// \returns true if \p T is either a raw pointer or reference to an uncounted
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
index 704c082..81c2434 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
@@ -87,8 +87,7 @@ public:
}
auto *E = MemberCallExpr->getImplicitObjectArgument();
QualType ArgType = MemberCallExpr->getObjectType();
- std::optional<bool> IsUncounted =
- isUncounted(ArgType->getAsCXXRecordDecl());
+ std::optional<bool> IsUncounted = isUncounted(ArgType);
if (IsUncounted && *IsUncounted && !isPtrOriginSafe(E))
reportBugOnThis(E);
}
diff --git a/clang/lib/Tooling/Transformer/Stencil.cpp b/clang/lib/Tooling/Transformer/Stencil.cpp
index bc4fa6e..223fb5a 100644
--- a/clang/lib/Tooling/Transformer/Stencil.cpp
+++ b/clang/lib/Tooling/Transformer/Stencil.cpp
@@ -50,7 +50,13 @@ static Error printNode(StringRef Id, const MatchFinder::MatchResult &Match,
auto NodeOrErr = getNode(Match.Nodes, Id);
if (auto Err = NodeOrErr.takeError())
return Err;
- NodeOrErr->print(Os, PrintingPolicy(Match.Context->getLangOpts()));
+ const PrintingPolicy PP(Match.Context->getLangOpts());
+ if (const auto *ND = NodeOrErr->get<NamedDecl>()) {
+ // For NamedDecls, we can do a better job than printing the whole thing.
+ ND->getNameForDiagnostic(Os, PP, false);
+ } else {
+ NodeOrErr->print(Os, PP);
+ }
*Result += Output;
return Error::success();
}
diff --git a/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl b/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl
new file mode 100644
index 0000000..42991d8
--- /dev/null
+++ b/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl
@@ -0,0 +1,64 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY %s | FileCheck -check-prefix=EMPTY %s
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump %s | FileCheck %s
+
+
+// This test tests two different AST generations. The "EMPTY" test mode verifies
+// the AST generated by forward declaration of the HLSL types which happens on
+// initializing the HLSL external AST with an AST Context.
+
+// The non-empty mode has a use that requires the StructuredBuffer type be complete,
+// which results in the AST being populated by the external AST source. That
+// case covers the full implementation of the template declaration and the
+// instantiated specialization.
+
+// EMPTY: ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit StructuredBuffer
+// EMPTY-NEXT: TemplateTypeParmDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> class depth 0 index 0 element_type
+// EMPTY-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit <undeserialized declarations> class StructuredBuffer
+// EMPTY-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit final
+
+// There should be no more occurrences of StructuredBuffer
+// EMPTY-NOT: StructuredBuffer
+
+#ifndef EMPTY
+
+StructuredBuffer<float> Buffer;
+
+#endif
+
+// CHECK: ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit StructuredBuffer
+// CHECK-NEXT: TemplateTypeParmDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> class depth 0 index 0 element_type
+// CHECK-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit class StructuredBuffer definition
+
+// CHECK: FinalAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit final
+// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit h 'element_type *'
+// CHECK-NEXT: HLSLResourceClassAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit UAV
+// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit TypedBuffer
+
+// CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> operator[] 'element_type &const (unsigned int) const'
+// CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> Idx 'unsigned int'
+// CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <<invalid sloc>>
+// CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <<invalid sloc>>
+// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> 'element_type' lvalue
+// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> 'element_type *' lvalue .h 0x{{[0-9A-Fa-f]+}}
+// CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> 'const StructuredBuffer<element_type>' lvalue implicit this
+// CHECK-NEXT: DeclRefExpr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> 'unsigned int' ParmVar 0x{{[0-9A-Fa-f]+}} 'Idx' 'unsigned int'
+// CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit always_inline
+
+// CHECK-NEXT: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> operator[] 'element_type &(unsigned int)'
+// CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> Idx 'unsigned int'
+// CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <<invalid sloc>>
+// CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <<invalid sloc>>
+// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> 'element_type' lvalue
+// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> 'element_type *' lvalue .h 0x{{[0-9A-Fa-f]+}}
+// CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> 'StructuredBuffer<element_type>' lvalue implicit this
+// CHECK-NEXT: DeclRefExpr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> 'unsigned int' ParmVar 0x{{[0-9A-Fa-f]+}} 'Idx' 'unsigned int'
+// CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit always_inline
+
+// CHECK: ClassTemplateSpecializationDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> class StructuredBuffer definition
+
+// CHECK: TemplateArgument type 'float'
+// CHECK-NEXT: BuiltinType 0x{{[0-9A-Fa-f]+}} 'float'
+// CHECK-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit final
+// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> <invalid sloc> implicit referenced h 'float *'
+// CHECK-NEXT: HLSLResourceClassAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit UAV
+// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <<invalid sloc>> Implicit TypedBuffer
diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
index a98c6eb9c..97efb35 100644
--- a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
@@ -6,6 +6,7 @@
void WTFBreakpointTrap();
void WTFCrashWithInfo(int, const char*, const char*, int);
void WTFReportAssertionFailure(const char* file, int line, const char* function, const char* assertion);
+void WTFReportBacktrace(void);
void WTFCrash(void);
void WTFCrashWithSecurityImplication(void);
@@ -334,6 +335,7 @@ public:
}
unsigned trivial60() { return ObjectWithNonTrivialDestructor { 5 }.value(); }
unsigned trivial61() { return DerivedNumber('7').value(); }
+ void trivial62() { WTFReportBacktrace(); }
static RefCounted& singleton() {
static RefCounted s_RefCounted;
@@ -341,6 +343,12 @@ public:
return s_RefCounted;
}
+ static RefCounted& otherSingleton() {
+ static RefCounted s_RefCounted;
+ s_RefCounted.ref();
+ return s_RefCounted;
+ }
+
Number nonTrivial1() { return Number(3) + Number(4); }
Number nonTrivial2() { return Number { 0.3 }; }
int nonTrivial3() { return v ? otherFunction() : 0; }
@@ -506,9 +514,12 @@ public:
getFieldTrivial().trivial59(); // no-warning
getFieldTrivial().trivial60(); // no-warning
getFieldTrivial().trivial61(); // no-warning
+ getFieldTrivial().trivial62(); // no-warning
RefCounted::singleton().trivial18(); // no-warning
RefCounted::singleton().someFunction(); // no-warning
+ RefCounted::otherSingleton().trivial18(); // no-warning
+ RefCounted::otherSingleton().someFunction(); // no-warning
getFieldTrivial().recursiveTrivialFunction(7); // no-warning
getFieldTrivial().recursiveComplexFunction(9);
diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.mm b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.mm
new file mode 100644
index 0000000..db0c5b1
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.mm
@@ -0,0 +1,26 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UncountedCallArgsChecker -verify %s
+// expected-no-diagnostics
+
+#import "mock-types.h"
+#import "mock-system-header.h"
+#import "../../Inputs/system-header-simulator-for-objc-dealloc.h"
+
+@interface Foo : NSObject
+
+@property (nonatomic, readonly) RefPtr<RefCountable> countable;
+
+- (void)execute;
+- (RefPtr<RefCountable>)_protectedRefCountable;
+@end
+
+@implementation Foo
+
+- (void)execute {
+ self._protectedRefCountable->method();
+}
+
+- (RefPtr<RefCountable>)_protectedRefCountable {
+ return _countable;
+}
+
+@end
diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl
new file mode 100644
index 0000000..16b7295
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
+
+StructuredBuffer<float> Buffer1;
+StructuredBuffer<vector<float, 4> > BufferArray[4];
+
+StructuredBuffer<float> Buffer2 : register(u3);
+StructuredBuffer<vector<float, 4> > BufferArray2[4] : register(u4);
+
+StructuredBuffer<float> Buffer3 : register(u3, space1);
+StructuredBuffer<vector<float, 4> > BufferArray3[4] : register(u4, space1);
+
+[numthreads(1,1,1)]
+void main() {
+}
+
+// CHECK: !hlsl.uavs = !{![[Single:[0-9]+]], ![[Array:[0-9]+]], ![[SingleAllocated:[0-9]+]], ![[ArrayAllocated:[0-9]+]], ![[SingleSpace:[0-9]+]], ![[ArraySpace:[0-9]+]]}
+// CHECK-DAG: ![[Single]] = !{ptr @"?Buffer1@@3V?$StructuredBuffer@M@hlsl@@A", i32 10, i32 9, i1 false, i32 -1, i32 0}
+// CHECK-DAG: ![[Array]] = !{ptr @"?BufferArray@@3PAV?$StructuredBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 false, i32 -1, i32 0}
+// CHECK-DAG: ![[SingleAllocated]] = !{ptr @"?Buffer2@@3V?$StructuredBuffer@M@hlsl@@A", i32 10, i32 9, i1 false, i32 3, i32 0}
+// CHECK-DAG: ![[ArrayAllocated]] = !{ptr @"?BufferArray2@@3PAV?$StructuredBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 false, i32 4, i32 0}
+// CHECK-DAG: ![[SingleSpace]] = !{ptr @"?Buffer3@@3V?$StructuredBuffer@M@hlsl@@A", i32 10, i32 9, i1 false, i32 3, i32 1}
+// CHECK-DAG: ![[ArraySpace]] = !{ptr @"?BufferArray3@@3PAV?$StructuredBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 false, i32 4, i32 1}
diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl
new file mode 100644
index 0000000..34019e5
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl
@@ -0,0 +1,12 @@
+// RUN: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefix=CHECK-SPIRV
+
+StructuredBuffer<float> Buf;
+
+// CHECK: define linkonce_odr noundef ptr @"??0?$StructuredBuffer@M@hlsl@@QAA@XZ"
+// CHECK-NEXT: entry:
+
+// CHECK: %[[HandleRes:[0-9]+]] = call ptr @llvm.dx.create.handle(i8 1)
+// CHECK: store ptr %[[HandleRes]], ptr %h, align 4
+
+// CHECK-SPIRV: %[[HandleRes:[0-9]+]] = call ptr @llvm.spv.create.handle(i8 1)
+// CHECK-SPIRV: store ptr %[[HandleRes]], ptr %h, align 8
diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl
new file mode 100644
index 0000000..8ddf8a6
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl
@@ -0,0 +1,52 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s
+
+StructuredBuffer<int16_t> BufI16;
+StructuredBuffer<uint16_t> BufU16;
+StructuredBuffer<int> BufI32;
+StructuredBuffer<uint> BufU32;
+StructuredBuffer<int64_t> BufI64;
+StructuredBuffer<uint64_t> BufU64;
+StructuredBuffer<half> BufF16;
+StructuredBuffer<float> BufF32;
+StructuredBuffer<double> BufF64;
+StructuredBuffer< vector<int16_t, 4> > BufI16x4;
+StructuredBuffer< vector<uint, 3> > BufU32x3;
+StructuredBuffer<half2> BufF16x2;
+StructuredBuffer<float3> BufF32x3;
+// TODO: StructuredBuffer<snorm half> BufSNormF16; -> 11
+// TODO: StructuredBuffer<unorm half> BufUNormF16; -> 12
+// TODO: StructuredBuffer<snorm float> BufSNormF32; -> 13
+// TODO: StructuredBuffer<unorm float> BufUNormF32; -> 14
+// TODO: StructuredBuffer<snorm double> BufSNormF64; -> 15
+// TODO: StructuredBuffer<unorm double> BufUNormF64; -> 16
+
+[numthreads(1,1,1)]
+void main(int GI : SV_GroupIndex) {
+ BufI16[GI] = 0;
+ BufU16[GI] = 0;
+ BufI32[GI] = 0;
+ BufU32[GI] = 0;
+ BufI64[GI] = 0;
+ BufU64[GI] = 0;
+ BufF16[GI] = 0;
+ BufF32[GI] = 0;
+ BufF64[GI] = 0;
+ BufI16x4[GI] = 0;
+ BufU32x3[GI] = 0;
+ BufF16x2[GI] = 0;
+ BufF32x3[GI] = 0;
+}
+
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufI16@@3V?$StructuredBuffer@F@hlsl@@A", i32 10, i32 2,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufU16@@3V?$StructuredBuffer@G@hlsl@@A", i32 10, i32 3,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufI32@@3V?$StructuredBuffer@H@hlsl@@A", i32 10, i32 4,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufU32@@3V?$StructuredBuffer@I@hlsl@@A", i32 10, i32 5,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufI64@@3V?$StructuredBuffer@J@hlsl@@A", i32 10, i32 6,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufU64@@3V?$StructuredBuffer@K@hlsl@@A", i32 10, i32 7,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufF16@@3V?$StructuredBuffer@$f16@@hlsl@@A", i32 10, i32 8,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufF32@@3V?$StructuredBuffer@M@hlsl@@A", i32 10, i32 9,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufF64@@3V?$StructuredBuffer@N@hlsl@@A", i32 10, i32 10,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufI16x4@@3V?$StructuredBuffer@T?$__vector@F$03@__clang@@@hlsl@@A", i32 10, i32 2,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufU32x3@@3V?$StructuredBuffer@T?$__vector@I$02@__clang@@@hlsl@@A", i32 10, i32 5,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufF16x2@@3V?$StructuredBuffer@T?$__vector@$f16@$01@__clang@@@hlsl@@A", i32 10, i32 8,
+// CHECK: !{{[0-9]+}} = !{ptr @"?BufF32x3@@3V?$StructuredBuffer@T?$__vector@M$02@__clang@@@hlsl@@A", i32 10, i32 9,
diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-subscript.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-subscript.hlsl
new file mode 100644
index 0000000..9bd885d
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-subscript.hlsl
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -o - -O0 %s | FileCheck %s
+
+StructuredBuffer<int> In;
+StructuredBuffer<int> Out;
+
+[numthreads(1,1,1)]
+void main(unsigned GI : SV_GroupIndex) {
+ Out[GI] = In[GI];
+}
+
+// Even at -O0 the subscript operators get inlined. The -O0 IR is a bit messy
+// and confusing to follow so the match here is pretty weak.
+
+// CHECK: define internal void @"?main@@YAXI@Z"
+// CHECK-NOT: call
+// CHECK: ret void
diff --git a/clang/test/CodeGenHLSL/loops/unroll.hlsl b/clang/test/CodeGenHLSL/loops/unroll.hlsl
index 7389f21..efca074 100644
--- a/clang/test/CodeGenHLSL/loops/unroll.hlsl
+++ b/clang/test/CodeGenHLSL/loops/unroll.hlsl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \
+// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \
// RUN: dxil-pc-shadermodel6.3-library -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s
/*** for ***/
@@ -35,8 +35,8 @@ void for_nested_one_unroll_enable()
for( int j = 0; j < 10; ++j)
s += i + j;
}
-// CHECK: br label %{{.*}}, !llvm.loop ![[FOR_NESTED_ENABLE:.*]]
-// CHECK-NOT: br label %{{.*}}, !llvm.loop ![[FOR_NESTED_1_ENABLE:.*]]
+// CHECK: br label %{{.*}}, !llvm.loop ![[FOR_NESTED_ENABLE_INNER:.*]]
+// CHECK: br label %{{.*}}, !llvm.loop ![[FOR_NESTED_ENABLE_OUTER:.*]]
}
void for_nested_two_unroll_enable()
@@ -111,20 +111,26 @@ void do_enable()
}
-// CHECK: ![[FOR_DISTINCT]] = distinct !{![[FOR_DISTINCT]], ![[FOR_COUNT:.*]]}
-// CHECK: ![[FOR_COUNT]] = !{!"llvm.loop.unroll.count", i32 8}
-// CHECK: ![[FOR_DISABLE]] = distinct !{![[FOR_DISABLE]], ![[DISABLE:.*]]}
-// CHECK: ![[DISABLE]] = !{!"llvm.loop.unroll.disable"}
-// CHECK: ![[FOR_ENABLE]] = distinct !{![[FOR_ENABLE]], ![[ENABLE:.*]]}
-// CHECK: ![[ENABLE]] = !{!"llvm.loop.unroll.enable"}
-// CHECK: ![[FOR_NESTED_ENABLE]] = distinct !{![[FOR_NESTED_ENABLE]], ![[ENABLE]]}
-// CHECK: ![[FOR_NESTED2_ENABLE]] = distinct !{![[FOR_NESTED2_ENABLE]], ![[ENABLE]]}
-// CHECK: ![[FOR_NESTED2_1_ENABLE]] = distinct !{![[FOR_NESTED2_1_ENABLE]], ![[ENABLE]]}
-// CHECK: ![[WHILE_DISTINCT]] = distinct !{![[WHILE_DISTINCT]], ![[WHILE_COUNT:.*]]}
-// CHECK: ![[WHILE_COUNT]] = !{!"llvm.loop.unroll.count", i32 4}
-// CHECK: ![[WHILE_DISABLE]] = distinct !{![[WHILE_DISABLE]], ![[DISABLE]]}
-// CHECK: ![[WHILE_ENABLE]] = distinct !{![[WHILE_ENABLE]], ![[ENABLE]]}
-// CHECK: ![[DO_DISTINCT]] = distinct !{![[DO_DISTINCT]], ![[DO_COUNT:.*]]}
-// CHECK: ![[DO_COUNT]] = !{!"llvm.loop.unroll.count", i32 16}
-// CHECK: ![[DO_DISABLE]] = distinct !{![[DO_DISABLE]], ![[DISABLE]]}
-// CHECK: ![[DO_ENABLE]] = distinct !{![[DO_ENABLE]], ![[ENABLE]]}
+// CHECK-DAG: [[MUST_PROGRESS:.*]] = !{!"llvm.loop.mustprogress"}
+// CHECK-DAG: [[DISABLE:.*]] = !{!"llvm.loop.unroll.disable"}
+// CHECK-DAG: [[FOR_COUNT:.*]] = !{!"llvm.loop.unroll.count", i32 8}
+// CHECK-DAG: [[ENABLE:.*]] = !{!"llvm.loop.unroll.enable"}
+// CHECK-DAG: [[WHILE_COUNT:.*]] = !{!"llvm.loop.unroll.count", i32 4}
+// CHECK-DAG: [[DO_COUNT:.*]] = !{!"llvm.loop.unroll.count", i32 16}
+
+// CHECK-DAG: ![[FOR_DISTINCT]] = distinct !{![[FOR_DISTINCT]], [[MUST_PROGRESS]], [[FOR_COUNT]]}
+// CHECK-DAG: ![[FOR_DISABLE]] = distinct !{![[FOR_DISABLE]], [[MUST_PROGRESS]], [[DISABLE]]}
+// CHECK-DAG: ![[FOR_ENABLE]] = distinct !{![[FOR_ENABLE]], [[MUST_PROGRESS]], [[ENABLE]]}
+
+// CHECK-DAG: ![[FOR_NESTED_ENABLE_INNER]] = distinct !{![[FOR_NESTED_ENABLE_INNER]], [[MUST_PROGRESS]]}
+// CHECK-DAG: ![[FOR_NESTED_ENABLE_OUTER]] = distinct !{![[FOR_NESTED_ENABLE_OUTER]], [[MUST_PROGRESS]], [[ENABLE]]}
+// CHECK-DAG: ![[FOR_NESTED2_ENABLE]] = distinct !{![[FOR_NESTED2_ENABLE]], [[MUST_PROGRESS]], [[ENABLE]]}
+// CHECK-DAG: ![[FOR_NESTED2_1_ENABLE]] = distinct !{![[FOR_NESTED2_1_ENABLE]], [[MUST_PROGRESS]], [[ENABLE]]}
+// CHECK-DAG: ![[WHILE_DISTINCT]] = distinct !{![[WHILE_DISTINCT]], [[MUST_PROGRESS]], [[WHILE_COUNT]]}
+
+// CHECK-DAG: ![[WHILE_DISABLE]] = distinct !{![[WHILE_DISABLE]], [[MUST_PROGRESS]], [[DISABLE]]}
+// CHECK-DAG: ![[WHILE_ENABLE]] = distinct !{![[WHILE_ENABLE]], [[MUST_PROGRESS]], [[ENABLE]]}
+// CHECK-DAG: ![[DO_DISTINCT]] = distinct !{![[DO_DISTINCT]], [[MUST_PROGRESS]], [[DO_COUNT]]}
+
+// CHECK-DAG: ![[DO_DISABLE]] = distinct !{![[DO_DISABLE]], [[MUST_PROGRESS]], [[DISABLE]]}
+// CHECK-DAG: ![[DO_ENABLE]] = distinct !{![[DO_ENABLE]], [[MUST_PROGRESS]], [[ENABLE]]}
diff --git a/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl b/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl
new file mode 100644
index 0000000..2450941
--- /dev/null
+++ b/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -fsyntax-only -verify %s
+
+typedef vector<float, 3> float3;
+
+StructuredBuffer<float3> Buffer;
+
+// expected-error@+2 {{class template 'StructuredBuffer' requires template arguments}}
+// expected-note@*:* {{template declaration from hidden source: template <class element_type> class StructuredBuffer}}
+StructuredBuffer BufferErr1;
+
+// expected-error@+2 {{too few template arguments for class template 'StructuredBuffer'}}
+// expected-note@*:* {{template declaration from hidden source: template <class element_type> class StructuredBuffer}}
+StructuredBuffer<> BufferErr2;
+
+[numthreads(1,1,1)]
+void main() {
+ (void)Buffer.h; // expected-error {{'h' is a private member of 'hlsl::StructuredBuffer<vector<float, 3> >'}}
+ // expected-note@* {{implicitly declared private here}}
+}
diff --git a/clang/unittests/Tooling/StencilTest.cpp b/clang/unittests/Tooling/StencilTest.cpp
index 26257cf..445912a 100644
--- a/clang/unittests/Tooling/StencilTest.cpp
+++ b/clang/unittests/Tooling/StencilTest.cpp
@@ -565,6 +565,28 @@ TEST_F(StencilTest, DescribeAnonNamespaceType) {
HasValue(std::string(Expected)));
}
+TEST_F(StencilTest, DescribeFunction) {
+ std::string Snippet = "int F(); F();";
+ std::string Expected = "F";
+ auto StmtMatch = matchStmt(Snippet, callExpr(callee(namedDecl().bind("fn"))));
+ ASSERT_TRUE(StmtMatch);
+ EXPECT_THAT_EXPECTED(describe("fn")->eval(StmtMatch->Result),
+ HasValue(std::string(Expected)));
+}
+
+TEST_F(StencilTest, DescribeImplicitOperator) {
+ std::string Snippet = "struct Tag {}; [](Tag){};";
+ std::string Expected = "operator()";
+ auto StmtMatch = matchStmt(
+ Snippet,
+ stmt(hasDescendant(
+ cxxMethodDecl(hasParameter(0, hasType(namedDecl(hasName("Tag")))))
+ .bind("fn"))));
+ ASSERT_TRUE(StmtMatch);
+ EXPECT_THAT_EXPECTED(describe("fn")->eval(StmtMatch->Result),
+ HasValue(std::string(Expected)));
+}
+
TEST_F(StencilTest, RunOp) {
StringRef Id = "id";
auto SimpleFn = [Id](const MatchResult &R) {
diff --git a/clang/utils/TableGen/ASTTableGen.cpp b/clang/utils/TableGen/ASTTableGen.cpp
index 54288ff..4734477 100644
--- a/clang/utils/TableGen/ASTTableGen.cpp
+++ b/clang/utils/TableGen/ASTTableGen.cpp
@@ -31,7 +31,8 @@ llvm::StringRef clang::tblgen::HasProperties::getName() const {
}
}
-static StringRef removeExpectedNodeNameSuffix(Record *node, StringRef suffix) {
+static StringRef removeExpectedNodeNameSuffix(const Record *node,
+ StringRef suffix) {
StringRef nodeName = node->getName();
if (!nodeName.ends_with(suffix)) {
PrintFatalError(node->getLoc(),
@@ -105,8 +106,7 @@ static void visitASTNodeRecursive(ASTNode node, ASTNode base,
}
}
-static void visitHierarchy(RecordKeeper &records,
- StringRef nodeClassName,
+static void visitHierarchy(const RecordKeeper &records, StringRef nodeClassName,
ASTNodeHierarchyVisitor<ASTNode> visit) {
// Check for the node class, just as a basic correctness check.
if (!records.getClass(nodeClassName)) {
@@ -114,13 +114,10 @@ static void visitHierarchy(RecordKeeper &records,
+ nodeClassName);
}
- // Find all the nodes in the hierarchy.
- auto nodes = records.getAllDerivedDefinitions(nodeClassName);
-
- // Derive the child map.
+ // Derive the child map for all nodes in the hierarchy.
ChildMap hierarchy;
ASTNode root;
- for (ASTNode node : nodes) {
+ for (ASTNode node : records.getAllDerivedDefinitions(nodeClassName)) {
if (auto base = node.getBase())
hierarchy.insert(std::make_pair(base, node));
else if (root)
@@ -136,8 +133,8 @@ static void visitHierarchy(RecordKeeper &records,
visitASTNodeRecursive(root, ASTNode(), hierarchy, visit);
}
-void clang::tblgen::visitASTNodeHierarchyImpl(RecordKeeper &records,
- StringRef nodeClassName,
- ASTNodeHierarchyVisitor<ASTNode> visit) {
+void clang::tblgen::visitASTNodeHierarchyImpl(
+ const RecordKeeper &records, StringRef nodeClassName,
+ ASTNodeHierarchyVisitor<ASTNode> visit) {
visitHierarchy(records, nodeClassName, visit);
}
diff --git a/clang/utils/TableGen/ASTTableGen.h b/clang/utils/TableGen/ASTTableGen.h
index 41f78a6..143d779 100644
--- a/clang/utils/TableGen/ASTTableGen.h
+++ b/clang/utils/TableGen/ASTTableGen.h
@@ -87,18 +87,18 @@ namespace clang {
namespace tblgen {
class WrappedRecord {
- llvm::Record *Record;
+ const llvm::Record *Record;
protected:
- WrappedRecord(llvm::Record *record = nullptr) : Record(record) {}
+ WrappedRecord(const llvm::Record *record = nullptr) : Record(record) {}
- llvm::Record *get() const {
+ const llvm::Record *get() const {
assert(Record && "accessing null record");
return Record;
}
public:
- llvm::Record *getRecord() const { return Record; }
+ const llvm::Record *getRecord() const { return Record; }
explicit operator bool() const { return Record != nullptr; }
@@ -144,7 +144,7 @@ class HasProperties : public WrappedRecord {
public:
static constexpr llvm::StringRef ClassName = HasPropertiesClassName;
- HasProperties(llvm::Record *record = nullptr) : WrappedRecord(record) {}
+ HasProperties(const llvm::Record *record = nullptr) : WrappedRecord(record) {}
llvm::StringRef getName() const;
@@ -157,7 +157,7 @@ public:
/// in one of Clang's AST hierarchies.
class ASTNode : public HasProperties {
public:
- ASTNode(llvm::Record *record = nullptr) : HasProperties(record) {}
+ ASTNode(const llvm::Record *record = nullptr) : HasProperties(record) {}
llvm::StringRef getName() const {
return get()->getName();
@@ -180,7 +180,7 @@ public:
class DeclNode : public ASTNode {
public:
- DeclNode(llvm::Record *record = nullptr) : ASTNode(record) {}
+ DeclNode(const llvm::Record *record = nullptr) : ASTNode(record) {}
llvm::StringRef getId() const;
std::string getClassName() const;
@@ -202,7 +202,7 @@ public:
class TypeNode : public ASTNode {
public:
- TypeNode(llvm::Record *record = nullptr) : ASTNode(record) {}
+ TypeNode(const llvm::Record *record = nullptr) : ASTNode(record) {}
llvm::StringRef getId() const;
llvm::StringRef getClassName() const;
@@ -224,7 +224,7 @@ public:
class StmtNode : public ASTNode {
public:
- StmtNode(llvm::Record *record = nullptr) : ASTNode(record) {}
+ StmtNode(const llvm::Record *record = nullptr) : ASTNode(record) {}
std::string getId() const;
llvm::StringRef getClassName() const;
@@ -247,7 +247,7 @@ public:
/// The type of a property.
class PropertyType : public WrappedRecord {
public:
- PropertyType(llvm::Record *record = nullptr) : WrappedRecord(record) {}
+ PropertyType(const llvm::Record *record = nullptr) : WrappedRecord(record) {}
/// Is this a generic specialization (i.e. `Array<T>` or `Optional<T>`)?
bool isGenericSpecialization() const {
@@ -331,7 +331,7 @@ public:
/// A rule for returning the kind of a type.
class TypeKindRule : public WrappedRecord {
public:
- TypeKindRule(llvm::Record *record = nullptr) : WrappedRecord(record) {}
+ TypeKindRule(const llvm::Record *record = nullptr) : WrappedRecord(record) {}
/// Return the type to which this applies.
PropertyType getParentType() const {
@@ -361,7 +361,7 @@ public:
/// An implementation case of a property type.
class TypeCase : public HasProperties {
public:
- TypeCase(llvm::Record *record = nullptr) : HasProperties(record) {}
+ TypeCase(const llvm::Record *record = nullptr) : HasProperties(record) {}
/// Return the name of this case.
llvm::StringRef getCaseName() const {
@@ -381,7 +381,7 @@ public:
/// A property of an AST node.
class Property : public WrappedRecord {
public:
- Property(llvm::Record *record = nullptr) : WrappedRecord(record) {}
+ Property(const llvm::Record *record = nullptr) : WrappedRecord(record) {}
/// Return the name of this property.
llvm::StringRef getName() const {
@@ -417,7 +417,8 @@ public:
/// a value (which is actually done when writing the value out).
class ReadHelperRule : public WrappedRecord {
public:
- ReadHelperRule(llvm::Record *record = nullptr) : WrappedRecord(record) {}
+ ReadHelperRule(const llvm::Record *record = nullptr)
+ : WrappedRecord(record) {}
/// Return the class for which this is a creation rule.
/// Should never be abstract.
@@ -437,7 +438,7 @@ public:
/// A rule for how to create an AST node from its properties.
class CreationRule : public WrappedRecord {
public:
- CreationRule(llvm::Record *record = nullptr) : WrappedRecord(record) {}
+ CreationRule(const llvm::Record *record = nullptr) : WrappedRecord(record) {}
/// Return the class for which this is a creation rule.
/// Should never be abstract.
@@ -457,7 +458,7 @@ public:
/// A rule which overrides the standard rules for serializing an AST node.
class OverrideRule : public WrappedRecord {
public:
- OverrideRule(llvm::Record *record = nullptr) : WrappedRecord(record) {}
+ OverrideRule(const llvm::Record *record = nullptr) : WrappedRecord(record) {}
/// Return the class for which this is an override rule.
/// Should never be abstract.
@@ -483,12 +484,12 @@ template <class NodeClass>
using ASTNodeHierarchyVisitor =
llvm::function_ref<void(NodeClass node, NodeClass base)>;
-void visitASTNodeHierarchyImpl(llvm::RecordKeeper &records,
+void visitASTNodeHierarchyImpl(const llvm::RecordKeeper &records,
llvm::StringRef nodeClassName,
ASTNodeHierarchyVisitor<ASTNode> visit);
template <class NodeClass>
-void visitASTNodeHierarchy(llvm::RecordKeeper &records,
+void visitASTNodeHierarchy(const llvm::RecordKeeper &records,
ASTNodeHierarchyVisitor<NodeClass> visit) {
visitASTNodeHierarchyImpl(records, NodeClass::getTableGenNodeClassName(),
[visit](ASTNode node, ASTNode base) {
diff --git a/clang/utils/TableGen/ClangBuiltinsEmitter.cpp b/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
index 94f12a0..4ae7600 100644
--- a/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
+++ b/clang/utils/TableGen/ClangBuiltinsEmitter.cpp
@@ -345,7 +345,7 @@ void EmitBuiltin(llvm::raw_ostream &OS, const Record *Builtin) {
}
} // namespace
-void clang::EmitClangBuiltins(llvm::RecordKeeper &Records,
+void clang::EmitClangBuiltins(const llvm::RecordKeeper &Records,
llvm::raw_ostream &OS) {
emitSourceFileHeader("List of builtins that Clang recognizes", OS);
diff --git a/clang/utils/TableGen/ClangCommentCommandInfoEmitter.cpp b/clang/utils/TableGen/ClangCommentCommandInfoEmitter.cpp
index aee7d38..1a2503d 100644
--- a/clang/utils/TableGen/ClangCommentCommandInfoEmitter.cpp
+++ b/clang/utils/TableGen/ClangCommentCommandInfoEmitter.cpp
@@ -20,16 +20,16 @@
using namespace llvm;
-void clang::EmitClangCommentCommandInfo(RecordKeeper &Records,
+void clang::EmitClangCommentCommandInfo(const RecordKeeper &Records,
raw_ostream &OS) {
emitSourceFileHeader("A list of commands useable in documentation comments",
OS, Records);
OS << "namespace {\n"
"const CommandInfo Commands[] = {\n";
- std::vector<Record *> Tags = Records.getAllDerivedDefinitions("Command");
+ ArrayRef<const Record *> Tags = Records.getAllDerivedDefinitions("Command");
for (size_t i = 0, e = Tags.size(); i != e; ++i) {
- Record &Tag = *Tags[i];
+ const Record &Tag = *Tags[i];
OS << " { "
<< "\"" << Tag.getValueAsString("Name") << "\", "
<< "\"" << Tag.getValueAsString("EndCommandName") << "\", " << i << ", "
@@ -62,7 +62,7 @@ void clang::EmitClangCommentCommandInfo(RecordKeeper &Records,
std::vector<StringMatcher::StringPair> Matches;
for (size_t i = 0, e = Tags.size(); i != e; ++i) {
- Record &Tag = *Tags[i];
+ const Record &Tag = *Tags[i];
std::string Name = std::string(Tag.getValueAsString("Name"));
std::string Return;
raw_string_ostream(Return) << "return &Commands[" << i << "];";
@@ -112,7 +112,7 @@ static std::string MangleName(StringRef Str) {
return Mangled;
}
-void clang::EmitClangCommentCommandList(RecordKeeper &Records,
+void clang::EmitClangCommentCommandList(const RecordKeeper &Records,
raw_ostream &OS) {
emitSourceFileHeader("A list of commands useable in documentation comments",
OS, Records);
@@ -121,9 +121,9 @@ void clang::EmitClangCommentCommandList(RecordKeeper &Records,
<< "# define COMMENT_COMMAND(NAME)\n"
<< "#endif\n";
- std::vector<Record *> Tags = Records.getAllDerivedDefinitions("Command");
+ ArrayRef<const Record *> Tags = Records.getAllDerivedDefinitions("Command");
for (size_t i = 0, e = Tags.size(); i != e; ++i) {
- Record &Tag = *Tags[i];
+ const Record &Tag = *Tags[i];
std::string MangledName = MangleName(Tag.getValueAsString("Name"));
OS << "COMMENT_COMMAND(" << MangledName << ")\n";
diff --git a/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp b/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp
index f1cd9af..bd75b3f 100644
--- a/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp
+++ b/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp
@@ -46,21 +46,17 @@ static bool translateCodePointToUTF8(unsigned CodePoint,
return true;
}
-void clang::EmitClangCommentHTMLNamedCharacterReferences(RecordKeeper &Records,
- raw_ostream &OS) {
- std::vector<Record *> Tags = Records.getAllDerivedDefinitions("NCR");
+void clang::EmitClangCommentHTMLNamedCharacterReferences(
+ const RecordKeeper &Records, raw_ostream &OS) {
std::vector<StringMatcher::StringPair> NameToUTF8;
SmallString<32> CLiteral;
- for (std::vector<Record *>::iterator I = Tags.begin(), E = Tags.end();
- I != E; ++I) {
- Record &Tag = **I;
- std::string Spelling = std::string(Tag.getValueAsString("Spelling"));
- uint64_t CodePoint = Tag.getValueAsInt("CodePoint");
+ for (const Record *Tag : Records.getAllDerivedDefinitions("NCR")) {
+ std::string Spelling = std::string(Tag->getValueAsString("Spelling"));
+ uint64_t CodePoint = Tag->getValueAsInt("CodePoint");
CLiteral.clear();
CLiteral.append("return ");
if (!translateCodePointToUTF8(CodePoint, CLiteral)) {
- SrcMgr.PrintMessage(Tag.getLoc().front(),
- SourceMgr::DK_Error,
+ SrcMgr.PrintMessage(Tag->getLoc().front(), SourceMgr::DK_Error,
Twine("invalid code point"));
continue;
}
diff --git a/clang/utils/TableGen/ClangCommentHTMLTagsEmitter.cpp b/clang/utils/TableGen/ClangCommentHTMLTagsEmitter.cpp
index 3dc1098..a457315 100644
--- a/clang/utils/TableGen/ClangCommentHTMLTagsEmitter.cpp
+++ b/clang/utils/TableGen/ClangCommentHTMLTagsEmitter.cpp
@@ -19,10 +19,11 @@
using namespace llvm;
-void clang::EmitClangCommentHTMLTags(RecordKeeper &Records, raw_ostream &OS) {
- std::vector<Record *> Tags = Records.getAllDerivedDefinitions("Tag");
+void clang::EmitClangCommentHTMLTags(const RecordKeeper &Records,
+ raw_ostream &OS) {
+ ArrayRef<const Record *> Tags = Records.getAllDerivedDefinitions("Tag");
std::vector<StringMatcher::StringPair> Matches;
- for (Record *Tag : Tags) {
+ for (const Record *Tag : Tags) {
Matches.emplace_back(std::string(Tag->getValueAsString("Spelling")),
"return true;");
}
@@ -35,12 +36,12 @@ void clang::EmitClangCommentHTMLTags(RecordKeeper &Records, raw_ostream &OS) {
<< "}\n\n";
}
-void clang::EmitClangCommentHTMLTagsProperties(RecordKeeper &Records,
+void clang::EmitClangCommentHTMLTagsProperties(const RecordKeeper &Records,
raw_ostream &OS) {
- std::vector<Record *> Tags = Records.getAllDerivedDefinitions("Tag");
+ ArrayRef<const Record *> Tags = Records.getAllDerivedDefinitions("Tag");
std::vector<StringMatcher::StringPair> MatchesEndTagOptional;
std::vector<StringMatcher::StringPair> MatchesEndTagForbidden;
- for (Record *Tag : Tags) {
+ for (const Record *Tag : Tags) {
std::string Spelling = std::string(Tag->getValueAsString("Spelling"));
StringMatcher::StringPair Match(Spelling, "return true;");
if (Tag->getValueAsBit("EndTagOptional"))
diff --git a/clang/utils/TableGen/ClangDataCollectorsEmitter.cpp b/clang/utils/TableGen/ClangDataCollectorsEmitter.cpp
index 4508293..dae6710 100644
--- a/clang/utils/TableGen/ClangDataCollectorsEmitter.cpp
+++ b/clang/utils/TableGen/ClangDataCollectorsEmitter.cpp
@@ -4,7 +4,7 @@
using namespace llvm;
-void clang::EmitClangDataCollectors(RecordKeeper &RK, raw_ostream &OS) {
+void clang::EmitClangDataCollectors(const RecordKeeper &RK, raw_ostream &OS) {
const auto &Defs = RK.getClasses();
for (const auto &Entry : Defs) {
Record &R = *Entry.second;
diff --git a/clang/utils/TableGen/ClangOpcodesEmitter.cpp b/clang/utils/TableGen/ClangOpcodesEmitter.cpp
index 120e1e2..7e426d5 100644
--- a/clang/utils/TableGen/ClangOpcodesEmitter.cpp
+++ b/clang/utils/TableGen/ClangOpcodesEmitter.cpp
@@ -20,11 +20,11 @@ using namespace llvm;
namespace {
class ClangOpcodesEmitter {
- RecordKeeper &Records;
+ const RecordKeeper &Records;
unsigned NumTypes;
public:
- ClangOpcodesEmitter(RecordKeeper &R)
+ ClangOpcodesEmitter(const RecordKeeper &R)
: Records(R), NumTypes(Records.getAllDerivedDefinitions("Type").size()) {}
void run(raw_ostream &OS);
@@ -404,6 +404,6 @@ void ClangOpcodesEmitter::PrintTypes(raw_ostream &OS,
OS << ">";
}
-void clang::EmitClangOpcodes(RecordKeeper &Records, raw_ostream &OS) {
+void clang::EmitClangOpcodes(const RecordKeeper &Records, raw_ostream &OS) {
ClangOpcodesEmitter(Records).run(OS);
}
diff --git a/clang/utils/TableGen/ClangOpenCLBuiltinEmitter.cpp b/clang/utils/TableGen/ClangOpenCLBuiltinEmitter.cpp
index 74c3a85..d68dcc4 100644
--- a/clang/utils/TableGen/ClangOpenCLBuiltinEmitter.cpp
+++ b/clang/utils/TableGen/ClangOpenCLBuiltinEmitter.cpp
@@ -87,7 +87,7 @@ struct BuiltinTableEntries {
//
class BuiltinNameEmitter {
public:
- BuiltinNameEmitter(RecordKeeper &Records, raw_ostream &OS)
+ BuiltinNameEmitter(const RecordKeeper &Records, raw_ostream &OS)
: Records(Records), OS(OS) {}
// Entrypoint to generate the functions and structures for checking
@@ -100,7 +100,7 @@ private:
// Contains OpenCL builtin functions and related information, stored as
// Record instances. They are coming from the associated TableGen file.
- RecordKeeper &Records;
+ const RecordKeeper &Records;
// The output file.
raw_ostream &OS;
@@ -113,7 +113,7 @@ private:
// \param Output (out) String containing the enums to emit in the output file.
// \param List (out) List containing the extracted Types, except the Types in
// TypesSeen.
- void ExtractEnumTypes(std::vector<Record *> &Types,
+ void ExtractEnumTypes(ArrayRef<const Record *> Types,
StringMap<bool> &TypesSeen, std::string &Output,
std::vector<const Record *> &List);
@@ -237,7 +237,7 @@ private:
/// Base class for emitting a file (e.g. header or test) from OpenCLBuiltins.td
class OpenCLBuiltinFileEmitterBase {
public:
- OpenCLBuiltinFileEmitterBase(RecordKeeper &Records, raw_ostream &OS)
+ OpenCLBuiltinFileEmitterBase(const RecordKeeper &Records, raw_ostream &OS)
: Records(Records), OS(OS) {}
virtual ~OpenCLBuiltinFileEmitterBase() = default;
@@ -305,7 +305,7 @@ protected:
// Contains OpenCL builtin functions and related information, stored as
// Record instances. They are coming from the associated TableGen file.
- RecordKeeper &Records;
+ const RecordKeeper &Records;
// The output file.
raw_ostream &OS;
@@ -316,7 +316,7 @@ protected:
// builtin function described in the .td input.
class OpenCLBuiltinTestEmitter : public OpenCLBuiltinFileEmitterBase {
public:
- OpenCLBuiltinTestEmitter(RecordKeeper &Records, raw_ostream &OS)
+ OpenCLBuiltinTestEmitter(const RecordKeeper &Records, raw_ostream &OS)
: OpenCLBuiltinFileEmitterBase(Records, OS) {}
// Entrypoint to generate the functions for testing all OpenCL builtin
@@ -329,7 +329,7 @@ public:
// prototype for each builtin function described in the .td input.
class OpenCLBuiltinHeaderEmitter : public OpenCLBuiltinFileEmitterBase {
public:
- OpenCLBuiltinHeaderEmitter(RecordKeeper &Records, raw_ostream &OS)
+ OpenCLBuiltinHeaderEmitter(const RecordKeeper &Records, raw_ostream &OS)
: OpenCLBuiltinFileEmitterBase(Records, OS) {}
// Entrypoint to generate the header.
@@ -362,7 +362,7 @@ void BuiltinNameEmitter::Emit() {
EmitQualTypeFinder();
}
-void BuiltinNameEmitter::ExtractEnumTypes(std::vector<Record *> &Types,
+void BuiltinNameEmitter::ExtractEnumTypes(ArrayRef<const Record *> Types,
StringMap<bool> &TypesSeen,
std::string &Output,
std::vector<const Record *> &List) {
@@ -392,11 +392,11 @@ void BuiltinNameEmitter::EmitDeclarations() {
// Extract generic types and non-generic types separately, to keep
// gentypes at the end of the enum which simplifies the special handling
// for gentypes in SemaLookup.
- std::vector<Record *> GenTypes =
+ ArrayRef<const Record *> GenTypes =
Records.getAllDerivedDefinitions("GenericType");
ExtractEnumTypes(GenTypes, TypesSeen, GenTypeEnums, GenTypeList);
- std::vector<Record *> Types = Records.getAllDerivedDefinitions("Type");
+ ArrayRef<const Record *> Types = Records.getAllDerivedDefinitions("Type");
ExtractEnumTypes(Types, TypesSeen, TypeEnums, TypeList);
OS << TypeEnums;
@@ -499,7 +499,7 @@ static void VerifySignature(const std::vector<Record *> &Signature,
void BuiltinNameEmitter::GetOverloads() {
// Populate the TypeMap.
- std::vector<Record *> Types = Records.getAllDerivedDefinitions("Type");
+ ArrayRef<const Record *> Types = Records.getAllDerivedDefinitions("Type");
unsigned I = 0;
for (const auto &T : Types) {
TypeMap.insert(std::make_pair(T, I++));
@@ -507,7 +507,8 @@ void BuiltinNameEmitter::GetOverloads() {
// Populate the SignaturesList and the FctOverloadMap.
unsigned CumulativeSignIndex = 0;
- std::vector<Record *> Builtins = Records.getAllDerivedDefinitions("Builtin");
+ ArrayRef<const Record *> Builtins =
+ Records.getAllDerivedDefinitions("Builtin");
for (const auto *B : Builtins) {
StringRef BName = B->getValueAsString("Name");
FctOverloadMap.try_emplace(BName);
@@ -535,7 +536,7 @@ void BuiltinNameEmitter::GetOverloads() {
void BuiltinNameEmitter::EmitExtensionTable() {
OS << "static const char *FunctionExtensionTable[] = {\n";
unsigned Index = 0;
- std::vector<Record *> FuncExtensions =
+ ArrayRef<const Record *> FuncExtensions =
Records.getAllDerivedDefinitions("FunctionExtension");
for (const auto &FE : FuncExtensions) {
@@ -804,11 +805,11 @@ static void OCL2Qual(Sema &S, const OpenCLTypeStruct &Ty,
OS << "\n switch (Ty.ID) {\n";
// Switch cases for image types (Image2d, Image3d, ...)
- std::vector<Record *> ImageTypes =
+ ArrayRef<const Record *> ImageTypes =
Records.getAllDerivedDefinitions("ImageType");
// Map an image type name to its 3 access-qualified types (RO, WO, RW).
- StringMap<SmallVector<Record *, 3>> ImageTypesMap;
+ StringMap<SmallVector<const Record *, 3>> ImageTypesMap;
for (auto *IT : ImageTypes)
ImageTypesMap[IT->getValueAsString("Name")].push_back(IT);
@@ -890,7 +891,7 @@ static void OCL2Qual(Sema &S, const OpenCLTypeStruct &Ty,
// Switch cases for non generic, non image types (int, int4, float, ...).
// Only insert the plain scalar type; vector information and type qualifiers
// are added in step 2.
- std::vector<Record *> Types = Records.getAllDerivedDefinitions("Type");
+ ArrayRef<const Record *> Types = Records.getAllDerivedDefinitions("Type");
StringMap<bool> TypesSeen;
for (const auto *T : Types) {
@@ -1211,7 +1212,8 @@ void OpenCLBuiltinTestEmitter::emit() {
unsigned TestID = 0;
// Iterate over all builtins.
- std::vector<Record *> Builtins = Records.getAllDerivedDefinitions("Builtin");
+ ArrayRef<const Record *> Builtins =
+ Records.getAllDerivedDefinitions("Builtin");
for (const auto *B : Builtins) {
StringRef Name = B->getValueAsString("Name");
@@ -1274,7 +1276,8 @@ void OpenCLBuiltinHeaderEmitter::emit() {
)";
// Iterate over all builtins; sort to follow order of definition in .td file.
- std::vector<Record *> Builtins = Records.getAllDerivedDefinitions("Builtin");
+ std::vector<const Record *> Builtins =
+ Records.getAllDerivedDefinitions("Builtin");
llvm::sort(Builtins, LessRecord());
for (const auto *B : Builtins) {
@@ -1319,18 +1322,19 @@ void OpenCLBuiltinHeaderEmitter::emit() {
"#pragma OPENCL EXTENSION all : disable\n";
}
-void clang::EmitClangOpenCLBuiltins(RecordKeeper &Records, raw_ostream &OS) {
+void clang::EmitClangOpenCLBuiltins(const RecordKeeper &Records,
+ raw_ostream &OS) {
BuiltinNameEmitter NameChecker(Records, OS);
NameChecker.Emit();
}
-void clang::EmitClangOpenCLBuiltinHeader(RecordKeeper &Records,
+void clang::EmitClangOpenCLBuiltinHeader(const RecordKeeper &Records,
raw_ostream &OS) {
OpenCLBuiltinHeaderEmitter HeaderFileGenerator(Records, OS);
HeaderFileGenerator.emit();
}
-void clang::EmitClangOpenCLBuiltinTests(RecordKeeper &Records,
+void clang::EmitClangOpenCLBuiltinTests(const RecordKeeper &Records,
raw_ostream &OS) {
OpenCLBuiltinTestEmitter TestFileGenerator(Records, OS);
TestFileGenerator.emit();
diff --git a/clang/utils/TableGen/ClangOptionDocEmitter.cpp b/clang/utils/TableGen/ClangOptionDocEmitter.cpp
index 8683561..8c32f02 100644
--- a/clang/utils/TableGen/ClangOptionDocEmitter.cpp
+++ b/clang/utils/TableGen/ClangOptionDocEmitter.cpp
@@ -24,8 +24,8 @@ using namespace llvm;
namespace {
struct DocumentedOption {
- Record *Option;
- std::vector<Record*> Aliases;
+ const Record *Option;
+ std::vector<const Record *> Aliases;
};
struct DocumentedGroup;
struct Documentation {
@@ -37,7 +37,7 @@ struct Documentation {
}
};
struct DocumentedGroup : Documentation {
- Record *Group;
+ const Record *Group;
};
static bool hasFlag(const Record *Option, StringRef OptionFlag,
@@ -63,25 +63,25 @@ static bool isOptionVisible(const Record *Option, const Record *DocInfo) {
}
// Reorganize the records into a suitable form for emitting documentation.
-Documentation extractDocumentation(RecordKeeper &Records,
+Documentation extractDocumentation(const RecordKeeper &Records,
const Record *DocInfo) {
Documentation Result;
// Build the tree of groups. The root in the tree is the fake option group
// (Record*)nullptr, which contains all top-level groups and options.
- std::map<Record*, std::vector<Record*> > OptionsInGroup;
- std::map<Record*, std::vector<Record*> > GroupsInGroup;
- std::map<Record*, std::vector<Record*> > Aliases;
+ std::map<const Record *, std::vector<const Record *>> OptionsInGroup;
+ std::map<const Record *, std::vector<const Record *>> GroupsInGroup;
+ std::map<const Record *, std::vector<const Record *>> Aliases;
- std::map<std::string, Record*> OptionsByName;
- for (Record *R : Records.getAllDerivedDefinitions("Option"))
+ std::map<std::string, const Record *> OptionsByName;
+ for (const Record *R : Records.getAllDerivedDefinitions("Option"))
OptionsByName[std::string(R->getValueAsString("Name"))] = R;
- auto Flatten = [](Record *R) {
+ auto Flatten = [](const Record *R) {
return R->getValue("DocFlatten") && R->getValueAsBit("DocFlatten");
};
- auto SkipFlattened = [&](Record *R) -> Record* {
+ auto SkipFlattened = [&](const Record *R) -> const Record * {
while (R && Flatten(R)) {
auto *G = dyn_cast<DefInit>(R->getValueInit("Group"));
if (!G)
@@ -91,17 +91,17 @@ Documentation extractDocumentation(RecordKeeper &Records,
return R;
};
- for (Record *R : Records.getAllDerivedDefinitions("OptionGroup")) {
+ for (const Record *R : Records.getAllDerivedDefinitions("OptionGroup")) {
if (Flatten(R))
continue;
- Record *Group = nullptr;
+ const Record *Group = nullptr;
if (auto *G = dyn_cast<DefInit>(R->getValueInit("Group")))
Group = SkipFlattened(G->getDef());
GroupsInGroup[Group].push_back(R);
}
- for (Record *R : Records.getAllDerivedDefinitions("Option")) {
+ for (const Record *R : Records.getAllDerivedDefinitions("Option")) {
if (auto *A = dyn_cast<DefInit>(R->getValueInit("Alias"))) {
Aliases[A->getDef()].push_back(R);
continue;
@@ -120,33 +120,33 @@ Documentation extractDocumentation(RecordKeeper &Records,
}
}
- Record *Group = nullptr;
+ const Record *Group = nullptr;
if (auto *G = dyn_cast<DefInit>(R->getValueInit("Group")))
Group = SkipFlattened(G->getDef());
OptionsInGroup[Group].push_back(R);
}
- auto CompareByName = [](Record *A, Record *B) {
+ auto CompareByName = [](const Record *A, const Record *B) {
return A->getValueAsString("Name") < B->getValueAsString("Name");
};
- auto CompareByLocation = [](Record *A, Record *B) {
+ auto CompareByLocation = [](const Record *A, const Record *B) {
return A->getLoc()[0].getPointer() < B->getLoc()[0].getPointer();
};
- auto DocumentationForOption = [&](Record *R) -> DocumentedOption {
+ auto DocumentationForOption = [&](const Record *R) -> DocumentedOption {
auto &A = Aliases[R];
llvm::sort(A, CompareByName);
return {R, std::move(A)};
};
- std::function<Documentation(Record *)> DocumentationForGroup =
- [&](Record *R) -> Documentation {
+ std::function<Documentation(const Record *)> DocumentationForGroup =
+ [&](const Record *R) -> Documentation {
Documentation D;
auto &Groups = GroupsInGroup[R];
llvm::sort(Groups, CompareByLocation);
- for (Record *G : Groups) {
+ for (const Record *G : Groups) {
D.Groups.emplace_back();
D.Groups.back().Group = G;
Documentation &Base = D.Groups.back();
@@ -157,7 +157,7 @@ Documentation extractDocumentation(RecordKeeper &Records,
auto &Options = OptionsInGroup[R];
llvm::sort(Options, CompareByName);
- for (Record *O : Options)
+ for (const Record *O : Options)
if (isOptionVisible(O, DocInfo))
D.Options.push_back(DocumentationForOption(O));
@@ -444,7 +444,7 @@ void emitDocumentation(int Depth, const Documentation &Doc,
} // namespace
-void clang::EmitClangOptDocs(RecordKeeper &Records, raw_ostream &OS) {
+void clang::EmitClangOptDocs(const RecordKeeper &Records, raw_ostream &OS) {
const Record *DocInfo = Records.getDef("GlobalDocumentation");
if (!DocInfo) {
PrintFatalError("The GlobalDocumentation top-level definition is missing, "
diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h
index 3a424c9..fe55ef2 100644
--- a/clang/utils/TableGen/TableGenBackends.h
+++ b/clang/utils/TableGen/TableGenBackends.h
@@ -73,7 +73,8 @@ void EmitClangAttrNodeTraverse(llvm::RecordKeeper &Records,
llvm::raw_ostream &OS);
void EmitClangAttrDocTable(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
-void EmitClangBuiltins(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+void EmitClangBuiltins(const llvm::RecordKeeper &Records,
+ llvm::raw_ostream &OS);
void EmitClangDiagsDefs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS,
const std::string &Component);
@@ -83,18 +84,18 @@ void EmitClangDiagsIndexName(llvm::RecordKeeper &Records,
void EmitClangSACheckers(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
-void EmitClangCommentHTMLTags(llvm::RecordKeeper &Records,
+void EmitClangCommentHTMLTags(const llvm::RecordKeeper &Records,
llvm::raw_ostream &OS);
-void EmitClangCommentHTMLTagsProperties(llvm::RecordKeeper &Records,
+void EmitClangCommentHTMLTagsProperties(const llvm::RecordKeeper &Records,
llvm::raw_ostream &OS);
-void EmitClangCommentHTMLNamedCharacterReferences(llvm::RecordKeeper &Records,
- llvm::raw_ostream &OS);
+void EmitClangCommentHTMLNamedCharacterReferences(
+ const llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
-void EmitClangCommentCommandInfo(llvm::RecordKeeper &Records,
+void EmitClangCommentCommandInfo(const llvm::RecordKeeper &Records,
llvm::raw_ostream &OS);
-void EmitClangCommentCommandList(llvm::RecordKeeper &Records,
+void EmitClangCommentCommandList(const llvm::RecordKeeper &Records,
llvm::raw_ostream &OS);
-void EmitClangOpcodes(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+void EmitClangOpcodes(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
void EmitClangSyntaxNodeList(llvm::RecordKeeper &Records,
llvm::raw_ostream &OS);
@@ -142,16 +143,16 @@ void EmitCdeBuiltinAliases(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
void EmitClangAttrDocs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
void EmitClangDiagDocs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
-void EmitClangOptDocs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+void EmitClangOptDocs(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
-void EmitClangOpenCLBuiltins(llvm::RecordKeeper &Records,
+void EmitClangOpenCLBuiltins(const llvm::RecordKeeper &Records,
llvm::raw_ostream &OS);
-void EmitClangOpenCLBuiltinHeader(llvm::RecordKeeper &Records,
+void EmitClangOpenCLBuiltinHeader(const llvm::RecordKeeper &Records,
llvm::raw_ostream &OS);
-void EmitClangOpenCLBuiltinTests(llvm::RecordKeeper &Records,
+void EmitClangOpenCLBuiltinTests(const llvm::RecordKeeper &Records,
llvm::raw_ostream &OS);
-void EmitClangDataCollectors(llvm::RecordKeeper &Records,
+void EmitClangDataCollectors(const llvm::RecordKeeper &Records,
llvm::raw_ostream &OS);
void EmitTestPragmaAttributeSupportedAttributes(llvm::RecordKeeper &Records,
diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64.c b/compiler-rt/lib/builtins/cpu_model/aarch64.c
index 0dd3977..ea2da23 100644
--- a/compiler-rt/lib/builtins/cpu_model/aarch64.c
+++ b/compiler-rt/lib/builtins/cpu_model/aarch64.c
@@ -14,7 +14,7 @@
#include "aarch64.h"
-#if !defined(__aarch64__)
+#if !defined(__aarch64__) && !defined(__arm64__) && !defined(_M_ARM64)
#error This file is intended only for aarch64-based targets
#endif
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp
index bf84a2f..666e6f3 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp
@@ -151,10 +151,6 @@ DTLS::DTV *DTLS_on_tls_get_addr(void *arg_void, void *res,
// This may happen inside the DTOR of main thread, so just ignore it.
tls_size = 0;
}
- if (tls_size) {
- CHECK_LE(tls_beg, reinterpret_cast<uptr>(res) - kDtvOffset);
- CHECK_LT(reinterpret_cast<uptr>(res) - kDtvOffset, tls_beg + tls_size);
- }
dtv->beg = tls_beg;
dtv->size = tls_size;
return dtv;
diff --git a/compiler-rt/test/asan/TestCases/Linux/stress_dtls.c b/compiler-rt/test/asan/TestCases/Linux/stress_dtls.c
index 857bec9..fd1ce0c 100644
--- a/compiler-rt/test/asan/TestCases/Linux/stress_dtls.c
+++ b/compiler-rt/test/asan/TestCases/Linux/stress_dtls.c
@@ -11,8 +11,8 @@
// RUN: %clangxx_asan -x c -DSO_NAME=f1 %s -shared -o %t-f1.so -fPIC
// RUN: %clangxx_asan -x c -DSO_NAME=f2 %s -shared -o %t-f2.so -fPIC
// RUN: %clangxx_asan %s -ldl -pthread -o %t
-// RUN: %env_asan_opts=verbosity=2 %run not %t 0 3
-// RUN: %env_asan_opts=verbosity=2 %run %t 2 3
+// RUN: %run %t 0 3
+// RUN: %run %t 2 3
// RUN: %env_asan_opts=verbosity=2 %run %t 10 2 2>&1 | FileCheck %s
// RUN: %env_asan_opts=verbosity=2:intercept_tls_get_addr=1 %run %t 10 2 2>&1 | FileCheck %s
// RUN: %env_asan_opts=verbosity=2:intercept_tls_get_addr=0 %run %t 10 2 2>&1 | FileCheck %s --check-prefix=CHECK0
@@ -29,7 +29,6 @@
// CHECK-NOT: num_live_dtls 5
//
// CHECK0-NOT: __tls_get_addr
-
/*
cc=your-compiler
diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp
index 10aef72..e94a49f 100644
--- a/flang/lib/Semantics/expression.cpp
+++ b/flang/lib/Semantics/expression.cpp
@@ -2744,7 +2744,6 @@ std::pair<const Symbol *, bool> ExpressionAnalyzer::ResolveGeneric(
(!procedure->IsElemental() && nonElemental)) {
int d{ComputeCudaMatchingDistance(
context_.languageFeatures(), *procedure, localActuals)};
- llvm::errs() << "matching distance: " << d << "\n";
if (d != crtMatchingDistance) {
if (d > crtMatchingDistance) {
continue;
diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
index 45dfe3e..8643c9b 100644
--- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
+++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
@@ -75,6 +75,10 @@ function(_get_compile_options_from_config output_var)
list(APPEND config_options "-DLIBC_TYPES_TIME_T_IS_32_BIT")
endif()
+ if(LIBC_ADD_NULL_CHECKS)
+ list(APPEND config_options "-DLIBC_ADD_NULL_CHECKS")
+ endif()
+
set(${output_var} ${config_options} PARENT_SCOPE)
endfunction(_get_compile_options_from_config)
diff --git a/libc/config/config.json b/libc/config/config.json
index 2e72c0a..7dfbb56 100644
--- a/libc/config/config.json
+++ b/libc/config/config.json
@@ -94,5 +94,11 @@
"value": false,
"doc": "Force the size of time_t to 64 bits, even on platforms where compatibility considerations would otherwise make it 32-bit."
}
+ },
+ "general": {
+ "LIBC_ADD_NULL_CHECKS": {
+ "value": true,
+ "doc": "Add nullptr checks in the library's implementations to some functions for which passing nullptr is undefined behavior."
+ }
}
}
diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst
index 54ca5d5..86875d4 100644
--- a/libc/docs/configure.rst
+++ b/libc/docs/configure.rst
@@ -30,6 +30,8 @@ to learn about the defaults for your platform and target.
- ``LIBC_CONF_KEEP_FRAME_POINTER``: Keep frame pointer in functions for better debugging experience.
* **"errno" options**
- ``LIBC_CONF_ERRNO_MODE``: The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, and LIBC_ERRNO_MODE_SYSTEM.
+* **"general" options**
+ - ``LIBC_ADD_NULL_CHECKS``: Add nullptr checks in the library's implementations to some functions for which passing nullptr is undefined behavior.
* **"math" options**
- ``LIBC_CONF_MATH_OPTIMIZATIONS``: Configures optimizations for math functions. Values accepted are LIBC_MATH_SKIP_ACCURATE_PASS, LIBC_MATH_SMALL_TABLES, LIBC_MATH_NO_ERRNO, LIBC_MATH_NO_EXCEPT, and LIBC_MATH_FAST.
* **"printf" options**
diff --git a/libc/hdr/CMakeLists.txt b/libc/hdr/CMakeLists.txt
index a2fad9b..e0b65b7 100644
--- a/libc/hdr/CMakeLists.txt
+++ b/libc/hdr/CMakeLists.txt
@@ -143,4 +143,22 @@ add_proxy_header_library(
libc.include.llvm-libc-macros.limits_macros
)
+add_proxy_header_library(
+ link_macros
+ HDRS
+ link_macros.h
+ FULL_BUILD_DEPENDS
+ libc.include.llvm-libc-macros.link_macros
+ libc.include.link
+)
+
+add_proxy_header_library(
+ sys_auxv_macros
+ HDRS
+ sys_auxv_macros.h
+ FULL_BUILD_DEPENDS
+ libc.include.llvm-libc-macros.sys_auxv_macros
+ libc.include.sys_auxv
+)
+
add_subdirectory(types)
diff --git a/libc/hdr/link_macros.h b/libc/hdr/link_macros.h
new file mode 100644
index 0000000..8a78a86
--- /dev/null
+++ b/libc/hdr/link_macros.h
@@ -0,0 +1,22 @@
+//===-- Definition of macros from link.h ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_HDR_LINK_MACROS_H
+#define LLVM_LIBC_HDR_LINK_MACROS_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-macros/link-macros.h"
+
+#else // Overlay mode
+
+#include <link.h>
+
+#endif // LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_LINK_MACROS_H
diff --git a/libc/hdr/sys_auxv_macros.h b/libc/hdr/sys_auxv_macros.h
new file mode 100644
index 0000000..c04011b
--- /dev/null
+++ b/libc/hdr/sys_auxv_macros.h
@@ -0,0 +1,22 @@
+//===-- Definition of macros from sys/auxv.h ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_HDR_SYS_AUXV_MACROS_H
+#define LLVM_LIBC_HDR_SYS_AUXV_MACROS_H
+
+#ifdef LIBC_FULL_BUILD
+
+#include "include/llvm-libc-macros/sys-auxv-macros.h"
+
+#else // Overlay mode
+
+#include <sys/auxv.h>
+
+#endif // LIBC_FULL_BUILD
+
+#endif // LLVM_LIBC_HDR_SYS_AUXV_MACROS_H
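
Both new proxy headers follow the same full-build/overlay switch. A sketch of a consumer, to show what the indirection buys (the has_vdso helper and its function-pointer parameter are illustrative, not part of the patch):

    // Including the proxy header instead of <sys/auxv.h> lets the same
    // source build in both modes: under LIBC_FULL_BUILD the AT_* macros
    // come from llvm-libc's own headers, otherwise from the system ones.
    #include "hdr/sys_auxv_macros.h"

    static bool has_vdso(unsigned long (*getauxval_fn)(unsigned long)) {
      // AT_SYSINFO_EHDR carries the base address of the vDSO ELF header.
      return getauxval_fn(AT_SYSINFO_EHDR) != 0;
    }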
diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index 9bd1e29..0302ad6 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -192,6 +192,9 @@ add_header_library(
libc.src.__support.CPP.optional
libc.src.__support.FPUtil.fp_bits
libc.src.__support.FPUtil.rounding_mode
+ libc.src.__support.macros.config
+ libc.src.__support.macros.null_check
+ libc.src.__support.macros.optimization
libc.src.errno.errno
)
diff --git a/libc/src/__support/OSUtil/linux/CMakeLists.txt b/libc/src/__support/OSUtil/linux/CMakeLists.txt
index 089cad4..6c70149 100644
--- a/libc/src/__support/OSUtil/linux/CMakeLists.txt
+++ b/libc/src/__support/OSUtil/linux/CMakeLists.txt
@@ -23,3 +23,33 @@ add_object_library(
libc.hdr.types.struct_f_owner_ex
libc.hdr.types.off_t
)
+
+add_header_library(
+ vdso_sym
+ HDRS
+ vdso_sym.h
+ DEPENDS
+ libc.src.__support.common
+)
+
+add_object_library(
+ vdso
+ HDRS
+ vdso.h
+ SRCS
+ vdso.cpp
+ DEPENDS
+ .${LIBC_TARGET_ARCHITECTURE}.vdso
+ libc.src.__support.CPP.array
+ libc.src.__support.CPP.optional
+ libc.src.__support.CPP.string_view
+ libc.src.__support.threads.callonce
+ libc.src.__support.threads.linux.futex_word_type
+ libc.hdr.types.struct_timeval
+ libc.hdr.types.struct_timespec
+ libc.hdr.types.clockid_t
+ libc.hdr.types.time_t
+ libc.hdr.link_macros
+ libc.src.errno.errno
+ libc.src.sys.auxv.getauxval
+)
diff --git a/libc/src/__support/OSUtil/linux/aarch64/CMakeLists.txt b/libc/src/__support/OSUtil/linux/aarch64/CMakeLists.txt
index eea9bad..d9451a1 100644
--- a/libc/src/__support/OSUtil/linux/aarch64/CMakeLists.txt
+++ b/libc/src/__support/OSUtil/linux/aarch64/CMakeLists.txt
@@ -5,3 +5,13 @@ add_header_library(
DEPENDS
libc.src.__support.common
)
+
+add_header_library(
+ vdso
+ HDRS
+ vdso.h
+ DEPENDS
+ libc.src.__support.common
+ libc.src.__support.CPP.string_view
+ libc.src.__support.OSUtil.linux.vdso_sym
+)
diff --git a/libc/src/__support/OSUtil/linux/aarch64/vdso.h b/libc/src/__support/OSUtil/linux/aarch64/vdso.h
new file mode 100644
index 0000000..3c4c620
--- /dev/null
+++ b/libc/src/__support/OSUtil/linux/aarch64/vdso.h
@@ -0,0 +1,37 @@
+//===---------- aarch64 vdso configuration ------------------------* C++ *-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_AARCH64_VDSO_H
+#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_AARCH64_VDSO_H
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/OSUtil/linux/vdso_sym.h"
+namespace LIBC_NAMESPACE_DECL {
+namespace vdso {
+// translate VDSOSym to symbol names
+// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/kernel/vdso/vdso.lds.S
+LIBC_INLINE constexpr cpp::string_view symbol_name(VDSOSym sym) {
+ switch (sym) {
+ case VDSOSym::RTSigReturn:
+ return "__kernel_rt_sigreturn";
+ case VDSOSym::GetTimeOfDay:
+ return "__kernel_gettimeofday";
+ case VDSOSym::ClockGetTime:
+ return "__kernel_clock_gettime";
+ case VDSOSym::ClockGetRes:
+ return "__kernel_clock_getres";
+ default:
+ return "";
+ }
+}
+
+// symbol versions
+LIBC_INLINE constexpr cpp::string_view symbol_version(VDSOSym) {
+ return "LINUX_2.6.39";
+}
+} // namespace vdso
+} // namespace LIBC_NAMESPACE_DECL
+#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_AARCH64_VDSO_H
diff --git a/libc/src/__support/OSUtil/linux/arm/CMakeLists.txt b/libc/src/__support/OSUtil/linux/arm/CMakeLists.txt
index 733366f..d991f7e 100644
--- a/libc/src/__support/OSUtil/linux/arm/CMakeLists.txt
+++ b/libc/src/__support/OSUtil/linux/arm/CMakeLists.txt
@@ -5,3 +5,13 @@ add_header_library(
DEPENDS
libc.src.__support.common
)
+
+add_header_library(
+ vdso
+ HDRS
+ vdso.h
+ DEPENDS
+ libc.src.__support.common
+ libc.src.__support.CPP.string_view
+ libc.src.__support.OSUtil.linux.vdso_sym
+)
diff --git a/libc/src/__support/OSUtil/linux/arm/vdso.h b/libc/src/__support/OSUtil/linux/arm/vdso.h
new file mode 100644
index 0000000..3de5860
--- /dev/null
+++ b/libc/src/__support/OSUtil/linux/arm/vdso.h
@@ -0,0 +1,37 @@
+//===---------- arm vdso configuration ----------------------------* C++ *-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_ARM_VDSO_H
+#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_ARM_VDSO_H
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/OSUtil/linux/vdso_sym.h"
+namespace LIBC_NAMESPACE_DECL {
+namespace vdso {
+// translate VDSOSym to symbol names
+// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm/vdso/vdso.lds.S
+LIBC_INLINE constexpr cpp::string_view symbol_name(VDSOSym sym) {
+ switch (sym) {
+ case VDSOSym::ClockGetTime:
+ return "__vdso_clock_gettime";
+ case VDSOSym::GetTimeOfDay:
+ return "__vdso_gettimeofday";
+ case VDSOSym::ClockGetRes:
+ return "__vdso_clock_getres";
+ case VDSOSym::ClockGetTime64:
+ return "__vdso_clock_gettime64";
+ default:
+ return "";
+ }
+}
+
+// symbol versions
+LIBC_INLINE constexpr cpp::string_view symbol_version(VDSOSym) {
+ return "LINUX_2.6";
+}
+} // namespace vdso
+} // namespace LIBC_NAMESPACE_DECL
+#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_ARM_VDSO_H
diff --git a/libc/src/__support/OSUtil/linux/riscv/CMakeLists.txt b/libc/src/__support/OSUtil/linux/riscv/CMakeLists.txt
index e271204..eb93dd4 100644
--- a/libc/src/__support/OSUtil/linux/riscv/CMakeLists.txt
+++ b/libc/src/__support/OSUtil/linux/riscv/CMakeLists.txt
@@ -5,3 +5,13 @@ add_header_library(
DEPENDS
libc.src.__support.common
)
+
+add_header_library(
+ vdso
+ HDRS
+ vdso.h
+ DEPENDS
+ libc.src.__support.common
+ libc.src.__support.CPP.string_view
+ libc.src.__support.OSUtil.linux.vdso_sym
+)
diff --git a/libc/src/__support/OSUtil/linux/riscv/vdso.h b/libc/src/__support/OSUtil/linux/riscv/vdso.h
new file mode 100644
index 0000000..24ddb25
--- /dev/null
+++ b/libc/src/__support/OSUtil/linux/riscv/vdso.h
@@ -0,0 +1,43 @@
+//===---------- RISC-V vdso configuration -------------------------* C++ *-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_RISCV_VDSO_H
+#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_RISCV_VDSO_H
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/OSUtil/linux/vdso_sym.h"
+namespace LIBC_NAMESPACE_DECL {
+namespace vdso {
+// translate VDSOSym to symbol names
+// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/riscv/kernel/vdso/vdso.lds.S
+LIBC_INLINE constexpr cpp::string_view symbol_name(VDSOSym sym) {
+ switch (sym) {
+ case VDSOSym::RTSigReturn:
+ return "__vdso_rt_sigreturn";
+ case VDSOSym::GetTimeOfDay:
+ return "__vdso_gettimeofday";
+ case VDSOSym::ClockGetTime:
+ return "__vdso_clock_gettime";
+ case VDSOSym::ClockGetRes:
+ return "__vdso_clock_getres";
+ case VDSOSym::GetCpu:
+ return "__vdso_getcpu";
+ case VDSOSym::FlushICache:
+ return "__vdso_flush_icache";
+ case VDSOSym::RiscvHwProbe:
+ return "__vdso_riscv_hwprobe";
+ default:
+ return "";
+ }
+}
+
+// symbol versions
+LIBC_INLINE constexpr cpp::string_view symbol_version(VDSOSym) {
+ return "LINUX_4.15";
+}
+} // namespace vdso
+} // namespace LIBC_NAMESPACE_DECL
+#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_RISCV_VDSO_H
diff --git a/libc/src/__support/OSUtil/linux/vdso.cpp b/libc/src/__support/OSUtil/linux/vdso.cpp
new file mode 100644
index 0000000..cb43764
--- /dev/null
+++ b/libc/src/__support/OSUtil/linux/vdso.cpp
@@ -0,0 +1,237 @@
+//===------------- Linux VDSO Implementation --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "src/__support/OSUtil/linux/vdso.h"
+#include "hdr/link_macros.h"
+#include "hdr/sys_auxv_macros.h"
+#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/optional.h"
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/threads/callonce.h"
+#include "src/__support/threads/linux/futex_word.h"
+#include "src/errno/libc_errno.h"
+#include "src/sys/auxv/getauxval.h"
+#include <linux/auxvec.h>
+
+// TODO: This is a temporary workaround to avoid including elf.h
+// Include our own headers for ElfW and friends once we have them.
+namespace LIBC_NAMESPACE_DECL {
+
+namespace vdso {
+
+Symbol::VDSOArray Symbol::global_cache{};
+CallOnceFlag Symbol::once_flag = callonce_impl::NOT_CALLED;
+
+namespace {
+// See https://refspecs.linuxfoundation.org/LSB_1.3.0/gLSB/gLSB/symverdefs.html
+struct Verdaux {
+ ElfW(Word) vda_name; /* Version or dependency names */
+ ElfW(Word) vda_next; /* Offset in bytes to next verdaux
+ entry */
+};
+struct Verdef {
+ ElfW(Half) vd_version; /* Version revision */
+ ElfW(Half) vd_flags; /* Version information */
+ ElfW(Half) vd_ndx; /* Version Index */
+ ElfW(Half) vd_cnt; /* Number of associated aux entries */
+ ElfW(Word) vd_hash; /* Version name hash value */
+ ElfW(Word) vd_aux; /* Offset in bytes to verdaux array */
+ ElfW(Word) vd_next; /* Offset in bytes to next verdef entry */
+ Verdef *next() const {
+ if (vd_next == 0)
+ return nullptr;
+ return reinterpret_cast<Verdef *>(reinterpret_cast<uintptr_t>(this) +
+ vd_next);
+ }
+ Verdaux *aux() const {
+ return reinterpret_cast<Verdaux *>(reinterpret_cast<uintptr_t>(this) +
+ vd_aux);
+ }
+};
+
+// version search procedure specified by
+// https://refspecs.linuxfoundation.org/LSB_1.3.0/gLSB/gLSB/symversion.html#SYMVERTBL
+cpp::string_view find_version(Verdef *verdef, ElfW(Half) * versym,
+ const char *strtab, size_t idx) {
+ constexpr ElfW(Half) VER_FLG_BASE = 0x1;
+ if (!versym)
+ return "";
+ ElfW(Half) identifier = versym[idx] & 0x7FFF;
+ // iterate through all version definitions
+ for (Verdef *def = verdef; def != nullptr; def = def->next()) {
+ // skip if this is a file-level version
+ if (def->vd_flags & VER_FLG_BASE)
+ continue;
+    // Check whether the version identifier matches. The highest bit marks
+    // whether the symbol is local; only the lower 15 bits carry the
+    // version identifier.
+ if ((def->vd_ndx & 0x7FFF) == identifier) {
+ Verdaux *aux = def->aux();
+ return strtab + aux->vda_name;
+ }
+ }
+ return "";
+}
+
+size_t shdr_get_symbol_count(ElfW(Shdr) * vdso_shdr, size_t e_shnum) {
+ if (!vdso_shdr)
+ return 0;
+  // iterate over all sections until we locate the dynamic symbol section
+ for (size_t i = 0; i < e_shnum; ++i) {
+    // the dynamic symbol section is a table section, so the number of
+    // entries can be computed as the ratio of the section size to the
+    // size of a single entry
+ if (vdso_shdr[i].sh_type == SHT_DYNSYM)
+ return vdso_shdr[i].sh_size / vdso_shdr[i].sh_entsize;
+ }
+ return 0;
+}
+
+struct VDSOSymbolTable {
+ const char *strtab;
+ ElfW(Sym) * symtab;
+ // The following can be nullptr if the vDSO does not have versioning
+ ElfW(Half) * versym;
+ Verdef *verdef;
+
+ void populate_symbol_cache(Symbol::VDSOArray &symbol_table,
+ size_t symbol_count, ElfW(Addr) vdso_addr) {
+ for (size_t i = 0, e = symbol_table.size(); i < e; ++i) {
+ Symbol sym = i;
+ cpp::string_view name = sym.name();
+ cpp::string_view version = sym.version();
+ if (name.empty())
+ continue;
+
+ for (size_t j = 0; j < symbol_count; ++j) {
+ if (name == strtab + symtab[j].st_name) {
+          // we found a symbol with the desired name;
+          // now check whether it also carries the right version
+ if (versym && verdef &&
+ version != find_version(verdef, versym, strtab, j))
+ continue;
+
+ // put the symbol address into the symbol table
+ symbol_table[i] =
+ reinterpret_cast<void *>(vdso_addr + symtab[j].st_value);
+ }
+ }
+ }
+ }
+};
+
+struct PhdrInfo {
+ ElfW(Addr) vdso_addr;
+ ElfW(Dyn) * vdso_dyn;
+ static cpp::optional<PhdrInfo> from(ElfW(Phdr) * vdso_phdr, size_t e_phnum,
+ uintptr_t vdso_ehdr_addr) {
+ constexpr ElfW(Addr) INVALID_ADDR = static_cast<ElfW(Addr)>(-1);
+ ElfW(Addr) vdso_addr = INVALID_ADDR;
+ ElfW(Dyn) *vdso_dyn = nullptr;
+ if (!vdso_phdr)
+ return cpp::nullopt;
+ // iterate through all the program headers until we get the desired pieces
+ for (size_t i = 0; i < e_phnum; ++i) {
+ if (vdso_phdr[i].p_type == PT_DYNAMIC)
+ vdso_dyn = reinterpret_cast<ElfW(Dyn) *>(vdso_ehdr_addr +
+ vdso_phdr[i].p_offset);
+
+ if (vdso_phdr[i].p_type == PT_LOAD)
+ vdso_addr =
+ vdso_ehdr_addr + vdso_phdr[i].p_offset - vdso_phdr[i].p_vaddr;
+
+      if (vdso_addr != INVALID_ADDR && vdso_dyn != nullptr)
+ return PhdrInfo{vdso_addr, vdso_dyn};
+ }
+
+ return cpp::nullopt;
+ }
+
+ cpp::optional<VDSOSymbolTable> populate_symbol_table() {
+ const char *strtab = nullptr;
+ ElfW(Sym) *symtab = nullptr;
+ ElfW(Half) *versym = nullptr;
+ Verdef *verdef = nullptr;
+ for (ElfW(Dyn) *d = vdso_dyn; d->d_tag != DT_NULL; ++d) {
+ switch (d->d_tag) {
+ case DT_STRTAB:
+ strtab = reinterpret_cast<const char *>(vdso_addr + d->d_un.d_ptr);
+ break;
+ case DT_SYMTAB:
+ symtab = reinterpret_cast<ElfW(Sym) *>(vdso_addr + d->d_un.d_ptr);
+ break;
+ case DT_VERSYM:
+ versym = reinterpret_cast<uint16_t *>(vdso_addr + d->d_un.d_ptr);
+ break;
+ case DT_VERDEF:
+ verdef = reinterpret_cast<Verdef *>(vdso_addr + d->d_un.d_ptr);
+ break;
+ }
+ if (strtab && symtab && versym && verdef)
+ break;
+ }
+ if (strtab == nullptr || symtab == nullptr)
+ return cpp::nullopt;
+
+ return VDSOSymbolTable{strtab, symtab, versym, verdef};
+ }
+};
+} // namespace
+
+void Symbol::initialize_vdso_global_cache() {
+ // first clear the symbol table
+ for (auto &i : global_cache)
+ i = nullptr;
+
+  // Get the address of the vDSO; protect errno, since getauxval may
+  // change it.
+ int errno_backup = libc_errno;
+ uintptr_t vdso_ehdr_addr = getauxval(AT_SYSINFO_EHDR);
+ // Get the memory address of the vDSO ELF header.
+ auto vdso_ehdr = reinterpret_cast<ElfW(Ehdr) *>(vdso_ehdr_addr);
+ // leave the table unpopulated if we don't have vDSO
+ if (vdso_ehdr == nullptr) {
+ libc_errno = errno_backup;
+ return;
+ }
+
+  // locate the section header table inside the ELF using the section
+  // header offset
+ auto vdso_shdr =
+ reinterpret_cast<ElfW(Shdr) *>(vdso_ehdr_addr + vdso_ehdr->e_shoff);
+ size_t symbol_count = shdr_get_symbol_count(vdso_shdr, vdso_ehdr->e_shnum);
+
+ // early return if no symbol is found
+ if (symbol_count == 0)
+ return;
+
+  // We need to find both the loadable segment and the dynamic linking
+  // section of the vDSO. Compute vdso_phdr as the program header table
+  // address using the program header offset.
+ ElfW(Phdr) *vdso_phdr =
+ reinterpret_cast<ElfW(Phdr) *>(vdso_ehdr_addr + vdso_ehdr->e_phoff);
+ cpp::optional<PhdrInfo> phdr_info =
+ PhdrInfo::from(vdso_phdr, vdso_ehdr->e_phnum, vdso_ehdr_addr);
+ // early return if either the dynamic linking or the loadable segment is
+ // not found
+ if (!phdr_info.has_value())
+ return;
+
+  // now, locate several more tables inside the dynamic linking section
+ cpp::optional<VDSOSymbolTable> vdso_symbol_table =
+ phdr_info->populate_symbol_table();
+
+ // early return if we can't find any required fields of the symbol table
+ if (!vdso_symbol_table.has_value())
+ return;
+
+ // finally, populate the global symbol table cache
+ vdso_symbol_table->populate_symbol_cache(global_cache, symbol_count,
+ phdr_info->vdso_addr);
+}
+} // namespace vdso
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/OSUtil/linux/vdso.h b/libc/src/__support/OSUtil/linux/vdso.h
new file mode 100644
index 0000000..a5108b3
--- /dev/null
+++ b/libc/src/__support/OSUtil/linux/vdso.h
@@ -0,0 +1,81 @@
+//===------------- Linux VDSO Header ----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_VDSO_H
+#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_VDSO_H
+#include "src/__support/CPP/array.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/attributes.h"
+#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/threads/callonce.h"
+
+#if defined(LIBC_TARGET_ARCH_IS_X86)
+#include "x86_64/vdso.h"
+#elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
+#include "aarch64/vdso.h"
+#elif defined(LIBC_TARGET_ARCH_IS_ARM)
+#include "arm/vdso.h"
+#elif defined(LIBC_TARGET_ARCH_IS_ANY_RISCV)
+#include "riscv/vdso.h"
+#else
+#error "unknown arch"
+#endif
+
+namespace LIBC_NAMESPACE_DECL {
+namespace vdso {
+
+class Symbol {
+ VDSOSym sym;
+
+public:
+ LIBC_INLINE_VAR static constexpr size_t COUNT =
+ static_cast<size_t>(VDSOSym::VDSOSymCount);
+ LIBC_INLINE constexpr explicit Symbol(VDSOSym sym) : sym(sym) {}
+ LIBC_INLINE constexpr Symbol(size_t idx) : sym(static_cast<VDSOSym>(idx)) {}
+ LIBC_INLINE constexpr cpp::string_view name() const {
+ return symbol_name(sym);
+ }
+ LIBC_INLINE constexpr cpp::string_view version() const {
+ return symbol_version(sym);
+ }
+ LIBC_INLINE constexpr operator size_t() const {
+ return static_cast<size_t>(sym);
+ }
+ LIBC_INLINE constexpr bool is_valid() const {
+ return *this < Symbol::global_cache.size();
+ }
+ using VDSOArray = cpp::array<void *, Symbol::COUNT>;
+
+private:
+ static CallOnceFlag once_flag;
+ static VDSOArray global_cache;
+ static void initialize_vdso_global_cache();
+
+ LIBC_INLINE void *get() const {
+ if (name().empty() || !is_valid())
+ return nullptr;
+
+ callonce(&once_flag, Symbol::initialize_vdso_global_cache);
+ return (global_cache[*this]);
+ }
+ template <VDSOSym sym> friend struct TypedSymbol;
+};
+
+template <VDSOSym sym> struct TypedSymbol {
+ LIBC_INLINE constexpr operator VDSOSymType<sym>() const {
+ return cpp::bit_cast<VDSOSymType<sym>>(Symbol{sym}.get());
+ }
+ template <typename... Args>
+ LIBC_INLINE auto operator()(Args &&...args) const {
+ return this->operator VDSOSymType<sym>()(cpp::forward<Args>(args)...);
+ }
+};
+
+} // namespace vdso
+
+} // namespace LIBC_NAMESPACE_DECL
+#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_VDSO_H
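
The TypedSymbol wrapper above is exercised by the unit tests near the end of this patch; as a quick orientation, a sketch of the intended call pattern (the fallback handling here is illustrative):

    #include "hdr/types/clockid_t.h"
    #include "hdr/types/struct_timespec.h"
    #include "src/__support/OSUtil/linux/vdso.h"

    namespace LIBC_NAMESPACE_DECL {
    // TypedSymbol converts to the typed function pointer for the symbol;
    // the pointer is null when the running kernel's vDSO does not export
    // it, so callers can test it before calling through.
    int clock_gettime_via_vdso(clockid_t id, timespec *ts) {
      vdso::TypedSymbol<vdso::VDSOSym::ClockGetTime> vdso_clock_gettime;
      if (!vdso_clock_gettime)
        return -1; // a real caller would fall back to the raw syscall
      return vdso_clock_gettime(id, ts);
    }
    } // namespace LIBC_NAMESPACE_DECL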
diff --git a/libc/src/__support/OSUtil/linux/vdso_sym.h b/libc/src/__support/OSUtil/linux/vdso_sym.h
new file mode 100644
index 0000000..eb5f204
--- /dev/null
+++ b/libc/src/__support/OSUtil/linux/vdso_sym.h
@@ -0,0 +1,70 @@
+//===------------- Linux VDSO Symbols ---------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "hdr/types/clock_t.h"
+#include "hdr/types/clockid_t.h"
+#include "hdr/types/struct_timespec.h"
+#include "hdr/types/struct_timeval.h"
+#include "hdr/types/time_t.h"
+#include "src/__support/common.h"
+#include <stddef.h> // For size_t.
+
+// NOLINTBEGIN(llvmlibc-implementation-in-namespace)
+// TODO: some of the following can be defined via proxy headers.
+struct __kernel_timespec;
+struct timezone;
+struct riscv_hwprobe;
+struct getcpu_cache;
+struct cpu_set_t;
+// NOLINTEND(llvmlibc-implementation-in-namespace)
+
+namespace LIBC_NAMESPACE_DECL {
+namespace vdso {
+
+enum class VDSOSym {
+ ClockGetTime,
+ ClockGetTime64,
+ GetTimeOfDay,
+ GetCpu,
+ Time,
+ ClockGetRes,
+ RTSigReturn,
+ FlushICache,
+ RiscvHwProbe,
+ VDSOSymCount
+};
+
+template <VDSOSym sym> LIBC_INLINE constexpr auto dispatcher() {
+ if constexpr (sym == VDSOSym::ClockGetTime)
+ return static_cast<int (*)(clockid_t, timespec *)>(nullptr);
+ else if constexpr (sym == VDSOSym::ClockGetTime64)
+ return static_cast<int (*)(clockid_t, __kernel_timespec *)>(nullptr);
+ else if constexpr (sym == VDSOSym::GetTimeOfDay)
+ return static_cast<int (*)(timeval *__restrict, timezone *__restrict)>(
+ nullptr);
+ else if constexpr (sym == VDSOSym::GetCpu)
+ return static_cast<int (*)(unsigned *, unsigned *, getcpu_cache *)>(
+ nullptr);
+ else if constexpr (sym == VDSOSym::Time)
+ return static_cast<time_t (*)(time_t *)>(nullptr);
+ else if constexpr (sym == VDSOSym::ClockGetRes)
+ return static_cast<int (*)(clockid_t, timespec *)>(nullptr);
+ else if constexpr (sym == VDSOSym::RTSigReturn)
+ return static_cast<void (*)(void)>(nullptr);
+ else if constexpr (sym == VDSOSym::FlushICache)
+ return static_cast<void (*)(void *, void *, unsigned int)>(nullptr);
+ else if constexpr (sym == VDSOSym::RiscvHwProbe)
+ return static_cast<int (*)(riscv_hwprobe *, size_t, size_t, cpu_set_t *,
+ unsigned)>(nullptr);
+ else
+ return static_cast<void *>(nullptr);
+}
+
+template <VDSOSym sym> using VDSOSymType = decltype(dispatcher<sym>());
+
+} // namespace vdso
+} // namespace LIBC_NAMESPACE_DECL
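
The dispatcher function above is never called; it exists only so decltype can recover a typed function pointer per enumerator. Spelled out for one symbol as a compile-time check (assuming cpp::is_same_v from src/__support/CPP/type_traits.h):

    #include "src/__support/CPP/type_traits.h"
    #include "src/__support/OSUtil/linux/vdso_sym.h"

    namespace LIBC_NAMESPACE_DECL {
    namespace vdso {
    // VDSOSymType<sym> is the return type of dispatcher<sym>(), so for
    // ClockGetTime it is exactly int (*)(clockid_t, timespec *).
    static_assert(cpp::is_same_v<VDSOSymType<VDSOSym::ClockGetTime>,
                                 int (*)(clockid_t, timespec *)>,
                  "dispatcher pins down the callable type of each symbol");
    } // namespace vdso
    } // namespace LIBC_NAMESPACE_DECL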
diff --git a/libc/src/__support/OSUtil/linux/x86_64/CMakeLists.txt b/libc/src/__support/OSUtil/linux/x86_64/CMakeLists.txt
index a7f2d74e6..1324491 100644
--- a/libc/src/__support/OSUtil/linux/x86_64/CMakeLists.txt
+++ b/libc/src/__support/OSUtil/linux/x86_64/CMakeLists.txt
@@ -5,3 +5,13 @@ add_header_library(
DEPENDS
libc.src.__support.common
)
+
+add_header_library(
+ vdso
+ HDRS
+ vdso.h
+ DEPENDS
+ libc.src.__support.common
+ libc.src.__support.CPP.string_view
+ libc.src.__support.OSUtil.linux.vdso_sym
+)
diff --git a/libc/src/__support/OSUtil/linux/x86_64/vdso.h b/libc/src/__support/OSUtil/linux/x86_64/vdso.h
new file mode 100644
index 0000000..abe7c33
--- /dev/null
+++ b/libc/src/__support/OSUtil/linux/x86_64/vdso.h
@@ -0,0 +1,43 @@
+//===---------- x86/x86_64 vdso configuration ---------------------* C++ *-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_X86_64_VDSO_H
+#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_X86_64_VDSO_H
+#include "src/__support/CPP/string_view.h"
+#include "src/__support/OSUtil/linux/vdso_sym.h"
+namespace LIBC_NAMESPACE_DECL {
+namespace vdso {
+// translate VDSOSym to symbol names
+// On x86, some symbols are also defined without the __vdso_ prefix, but
+// using the __vdso_ prefix is the suggested practice.
+// There is also an __vdso_sgx_enter_enclave for SGX support; we do not
+// include it here for now.
+// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/x86/entry/vdso/vdso.lds.S
+LIBC_INLINE constexpr cpp::string_view symbol_name(VDSOSym sym) {
+ switch (sym) {
+ case VDSOSym::ClockGetTime:
+ return "__vdso_clock_gettime";
+ case VDSOSym::GetTimeOfDay:
+ return "__vdso_gettimeofday";
+ case VDSOSym::GetCpu:
+ return "__vdso_getcpu";
+ case VDSOSym::Time:
+ return "__vdso_time";
+ case VDSOSym::ClockGetRes:
+ return "__vdso_clock_getres";
+ default:
+ return "";
+ }
+}
+
+// symbol versions
+LIBC_INLINE constexpr cpp::string_view symbol_version(VDSOSym) {
+ return "LINUX_2.6";
+}
+} // namespace vdso
+} // namespace LIBC_NAMESPACE_DECL
+#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_X86_64_VDSO_H
diff --git a/libc/src/__support/macros/CMakeLists.txt b/libc/src/__support/macros/CMakeLists.txt
index bcd47c3..99d4f64 100644
--- a/libc/src/__support/macros/CMakeLists.txt
+++ b/libc/src/__support/macros/CMakeLists.txt
@@ -27,3 +27,13 @@ add_header_library(
DEPENDS
libc.src.__support.macros.properties.compiler
)
+
+add_header_library(
+ null_check
+ HDRS
+ null_check.h
+ DEPENDS
+ .config
+ .optimization
+ .sanitizer
+)
diff --git a/libc/src/__support/macros/null_check.h b/libc/src/__support/macros/null_check.h
new file mode 100644
index 0000000..400f7d8
--- /dev/null
+++ b/libc/src/__support/macros/null_check.h
@@ -0,0 +1,33 @@
+//===-- Safe nullptr check --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MACROS_NULL_CHECK_H
+#define LLVM_LIBC_SRC___SUPPORT_MACROS_NULL_CHECK_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h"
+#include "src/__support/macros/sanitizer.h"
+
+#if defined(LIBC_ADD_NULL_CHECKS) && !defined(LIBC_HAS_SANITIZER)
+// Use a volatile access so the compiler cannot optimize away the nullptr
+// dereference; the intent is to crash deterministically with SIGSEGV.
+#define LIBC_CRASH_ON_NULLPTR(PTR) \
+ do { \
+ if (LIBC_UNLIKELY(PTR == nullptr)) { \
+ volatile auto *crashing = PTR; \
+ [[maybe_unused]] volatile auto crash = *crashing; \
+ __builtin_trap(); \
+ } \
+ } while (0)
+#else
+#define LIBC_CRASH_ON_NULLPTR(PTR)                                            \
+ do { \
+ } while (0)
+#endif
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MACROS_NULL_CHECK_H
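
strtonan in str_to_float.h below is the first adopter of the macro; as a self-contained illustration of its contract (the function here is hypothetical, not part of the patch):

    #include "src/__support/macros/null_check.h"

    // With LIBC_ADD_NULL_CHECKS enabled and no sanitizer active, a nullptr
    // argument faults immediately at the function boundary instead of at
    // some arbitrary point deep inside the implementation.
    static int checked_strlen(const char *s) {
      LIBC_CRASH_ON_NULLPTR(s);
      int n = 0;
      while (s[n] != '\0')
        ++n;
      return n;
    }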
diff --git a/libc/src/__support/macros/sanitizer.h b/libc/src/__support/macros/sanitizer.h
index c4f8b5b..c20412e 100644
--- a/libc/src/__support/macros/sanitizer.h
+++ b/libc/src/__support/macros/sanitizer.h
@@ -15,7 +15,25 @@
// Functions to unpoison memory
//-----------------------------------------------------------------------------
+#if LIBC_HAS_FEATURE(address_sanitizer) || defined(__SANITIZE_ADDRESS__)
+#define LIBC_HAS_ADDRESS_SANITIZER
+#endif
+
#if LIBC_HAS_FEATURE(memory_sanitizer)
+#define LIBC_HAS_MEMORY_SANITIZER
+#endif
+
+#if LIBC_HAS_FEATURE(undefined_behavior_sanitizer)
+#define LIBC_HAS_UNDEFINED_BEHAVIOR_SANITIZER
+#endif
+
+#if defined(LIBC_HAS_ADDRESS_SANITIZER) || \
+ defined(LIBC_HAS_MEMORY_SANITIZER) || \
+ defined(LIBC_HAS_UNDEFINED_BEHAVIOR_SANITIZER)
+#define LIBC_HAS_SANITIZER
+#endif
+
+#ifdef LIBC_HAS_MEMORY_SANITIZER
// Only perform MSAN unpoison in non-constexpr context.
#include <sanitizer/msan_interface.h>
#define MSAN_UNPOISON(addr, size) \
@@ -27,8 +45,7 @@
#define MSAN_UNPOISON(ptr, size)
#endif
-#if LIBC_HAS_FEATURE(address_sanitizer)
-#define LIBC_HAVE_ADDRESS_SANITIZER
+#ifdef LIBC_HAS_ADDRESS_SANITIZER
#include <sanitizer/asan_interface.h>
#define ASAN_POISON_MEMORY_REGION(addr, size) \
__asan_poison_memory_region((addr), (size))
diff --git a/libc/src/__support/str_to_float.h b/libc/src/__support/str_to_float.h
index ffd6ebf..a452b3a 100644
--- a/libc/src/__support/str_to_float.h
+++ b/libc/src/__support/str_to_float.h
@@ -20,6 +20,8 @@
#include "src/__support/detailed_powers_of_ten.h"
#include "src/__support/high_precision_decimal.h"
#include "src/__support/macros/config.h"
+#include "src/__support/macros/null_check.h"
+#include "src/__support/macros/optimization.h"
#include "src/__support/str_to_integer.h"
#include "src/__support/str_to_num_result.h"
#include "src/__support/uint128.h"
@@ -1208,6 +1210,8 @@ template <class T> LIBC_INLINE StrToNumResult<T> strtonan(const char *arg) {
using FPBits = typename fputil::FPBits<T>;
using StorageType = typename FPBits::StorageType;
+ LIBC_CRASH_ON_NULLPTR(arg);
+
FPBits result;
int error = 0;
StorageType nan_mantissa = 0;
diff --git a/libc/src/stdlib/atexit.cpp b/libc/src/stdlib/atexit.cpp
index c8a15dd..799aad1 100644
--- a/libc/src/stdlib/atexit.cpp
+++ b/libc/src/stdlib/atexit.cpp
@@ -16,6 +16,7 @@ namespace LIBC_NAMESPACE_DECL {
constinit ExitCallbackList atexit_callbacks;
Mutex handler_list_mtx(false, false, false, false);
+[[gnu::weak]] extern void teardown_main_tls();
extern "C" {
@@ -24,8 +25,11 @@ int __cxa_atexit(AtExitCallback *callback, void *payload, void *) {
}
void __cxa_finalize(void *dso) {
- if (!dso)
+ if (!dso) {
call_exit_callbacks(atexit_callbacks);
+ if (teardown_main_tls)
+ teardown_main_tls();
+ }
}
} // extern "C"
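
The guard added to __cxa_finalize (and to quick_exit below) relies on weak linkage. The pattern in isolation (the wrapper function is illustrative):

    // A weak declaration may remain undefined at link time; the symbol's
    // address is then null, so `if (fn)` doubles as a "was a definition
    // linked in?" test before the call.
    [[gnu::weak]] extern void teardown_main_tls();

    static void run_teardown_if_linked() { // hypothetical wrapper
      if (teardown_main_tls)
        teardown_main_tls();
    }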
diff --git a/libc/src/stdlib/quick_exit.cpp b/libc/src/stdlib/quick_exit.cpp
index a5abf3e..29110b3 100644
--- a/libc/src/stdlib/quick_exit.cpp
+++ b/libc/src/stdlib/quick_exit.cpp
@@ -16,9 +16,12 @@
namespace LIBC_NAMESPACE_DECL {
extern ExitCallbackList at_quick_exit_callbacks;
+[[gnu::weak]] extern void teardown_main_tls();
[[noreturn]] LLVM_LIBC_FUNCTION(void, quick_exit, (int status)) {
call_exit_callbacks(at_quick_exit_callbacks);
+ if (teardown_main_tls)
+ teardown_main_tls();
internal::exit(status);
}
diff --git a/libc/src/sys/auxv/getauxval.h b/libc/src/sys/auxv/getauxval.h
index 3e69713..d9da45ff 100644
--- a/libc/src/sys/auxv/getauxval.h
+++ b/libc/src/sys/auxv/getauxval.h
@@ -9,8 +9,8 @@
#ifndef LLVM_LIBC_SRC_SYS_AUXV_GETAUXVAL_H
#define LLVM_LIBC_SRC_SYS_AUXV_GETAUXVAL_H
+#include "hdr/sys_auxv_macros.h"
#include "src/__support/macros/config.h"
-#include <sys/auxv.h>
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
index f09541b..8bd0c3a 100644
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -17,6 +17,9 @@ extern "C" int main(int argc, char **argv, char **envp);
namespace LIBC_NAMESPACE_DECL {
+// FIXME: Factor this out into common logic so we don't need to stub it here.
+void teardown_main_tls() {}
+
DataEnvironment app;
extern "C" uintptr_t __init_array_start[];
diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
index ef1e63e..bc529b3 100644
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@@ -19,6 +19,9 @@ namespace LIBC_NAMESPACE_DECL {
DataEnvironment app;
+// FIXME: Factor this out into common logic so we don't need to stub it here.
+void teardown_main_tls() {}
+
extern "C" {
// Nvidia's 'nvlink' linker does not provide these symbols. We instead need
+// to manually create them and update the globals in the loader implementation.
diff --git a/libc/startup/linux/do_start.cpp b/libc/startup/linux/do_start.cpp
index 72060b4..ff104c7 100644
--- a/libc/startup/linux/do_start.cpp
+++ b/libc/startup/linux/do_start.cpp
@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
#include "startup/linux/do_start.h"
+#include "config/linux/app.h"
#include "include/llvm-libc-macros/link-macros.h"
#include "src/__support/OSUtil/syscall.h"
#include "src/__support/macros/config.h"
@@ -60,6 +61,10 @@ static void call_fini_array_callbacks() {
}
static ThreadAttributes main_thread_attrib;
+static TLSDescriptor tls;
+// We keep teardown_main_tls separate from the atexit callbacks, as the
+// callback functions themselves may require TLS.
+void teardown_main_tls() { cleanup_tls(tls.addr, tls.size); }
[[noreturn]] void do_start() {
auto tid = syscall_impl<long>(SYS_gettid);
@@ -122,7 +127,6 @@ static ThreadAttributes main_thread_attrib;
// This descriptor has to be static since its cleanup function cannot
// capture the context.
- static TLSDescriptor tls;
init_tls(tls);
if (tls.size != 0 && !set_thread_ptr(tls.tp))
syscall_impl<long>(SYS_exit, 1);
@@ -130,10 +134,7 @@ static ThreadAttributes main_thread_attrib;
self.attrib = &main_thread_attrib;
main_thread_attrib.atexit_callback_mgr =
internal::get_thread_atexit_callback_mgr();
- // We register the cleanup_tls function to be the last atexit callback to be
- // invoked. It will tear down the TLS. Other callbacks may depend on TLS (such
- // as the stack protector canary).
- atexit([]() { cleanup_tls(tls.addr, tls.size); });
+
// We want the fini array callbacks to be run after other atexit
// callbacks are run. So, we register them before running the init
// array callbacks as they can potentially register their own atexit
diff --git a/libc/test/src/__support/OSUtil/linux/CMakeLists.txt b/libc/test/src/__support/OSUtil/linux/CMakeLists.txt
index bfb072c..ff82616 100644
--- a/libc/test/src/__support/OSUtil/linux/CMakeLists.txt
+++ b/libc/test/src/__support/OSUtil/linux/CMakeLists.txt
@@ -1,3 +1,21 @@
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE})
add_subdirectory(${LIBC_TARGET_ARCHITECTURE})
endif()
+
+add_libc_test(
+ vdso_test
+ SUITE libc-osutil-tests
+ SRCS vdso_test.cpp
+ DEPENDS
+ libc.src.__support.OSUtil.linux.vdso
+ libc.src.__support.OSUtil.osutil
+ libc.hdr.types.struct_sigaction
+ libc.hdr.types.struct_timeval
+ libc.hdr.types.struct_timespec
+ libc.hdr.types.clockid_t
+ libc.hdr.types.time_t
+ libc.hdr.time_macros
+ libc.hdr.signal_macros
+ libc.src.signal.sigaction
+ libc.src.signal.raise
+)
diff --git a/libc/test/src/__support/OSUtil/linux/vdso_test.cpp b/libc/test/src/__support/OSUtil/linux/vdso_test.cpp
new file mode 100644
index 0000000..2363db6
--- /dev/null
+++ b/libc/test/src/__support/OSUtil/linux/vdso_test.cpp
@@ -0,0 +1,162 @@
+//===-- Unittests for VDSO ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/signal_macros.h"
+#include "hdr/time_macros.h"
+#include "hdr/types/clockid_t.h"
+#include "hdr/types/struct_sigaction.h"
+#include "hdr/types/struct_timespec.h"
+#include "hdr/types/struct_timeval.h"
+#include "hdr/types/time_t.h"
+#include "src/__support/OSUtil/linux/vdso.h"
+#include "src/__support/OSUtil/syscall.h"
+#include "src/__support/macros/properties/architectures.h"
+#include "src/signal/raise.h"
+#include "src/signal/sigaction.h"
+#include "test/UnitTest/ErrnoSetterMatcher.h"
+#include "test/UnitTest/LibcTest.h"
+#include "test/UnitTest/Test.h"
+#include <linux/time_types.h>
+#include <sys/syscall.h>
+
+struct riscv_hwprobe {
+ int64_t key;
+ uint64_t value;
+};
+
+namespace LIBC_NAMESPACE_DECL {
+// For x86_64, we explicitly test that some traditional vdso symbols are
+// indeed available.
+
+TEST(LlvmLibcOSUtilVDSOTest, GetTimeOfDay) {
+ vdso::TypedSymbol<vdso::VDSOSym::GetTimeOfDay> symbol;
+ if (!symbol)
+ return;
+ timeval tv;
+ EXPECT_EQ(symbol(&tv, nullptr), 0);
+ // Hopefully people are not building time machines using our libc.
+ EXPECT_GT(tv.tv_sec, static_cast<decltype(tv.tv_sec)>(0));
+}
+
+TEST(LlvmLibcOSUtilVDSOTest, Time) {
+ vdso::TypedSymbol<vdso::VDSOSym::Time> symbol;
+ if (!symbol)
+ return;
+ time_t a, b;
+ EXPECT_GT(symbol(&a), static_cast<time_t>(0));
+ EXPECT_GT(symbol(&b), static_cast<time_t>(0));
+ EXPECT_GE(b, a);
+}
+
+TEST(LlvmLibcOSUtilVDSOTest, ClockGetTime) {
+ vdso::TypedSymbol<vdso::VDSOSym::ClockGetTime> symbol;
+ if (!symbol)
+ return;
+ timespec a, b;
+ EXPECT_EQ(symbol(CLOCK_MONOTONIC, &a), 0);
+ EXPECT_EQ(symbol(CLOCK_MONOTONIC, &b), 0);
+ if (a.tv_sec == b.tv_sec) {
+ EXPECT_LT(a.tv_nsec, b.tv_nsec);
+ } else {
+ EXPECT_LT(a.tv_sec, b.tv_sec);
+ }
+}
+
+TEST(LlvmLibcOSUtilVDSOTest, ClockGetTime64) {
+ vdso::TypedSymbol<vdso::VDSOSym::ClockGetTime64> symbol;
+ if (!symbol)
+ return;
+ // See kernel API at
+ // https://elixir.bootlin.com/linux/latest/source/tools/testing/selftests/vDSO/vdso_test_correctness.c#L155
+ __kernel_timespec a, b;
+ EXPECT_EQ(symbol(CLOCK_MONOTONIC, &a), 0);
+ EXPECT_EQ(symbol(CLOCK_MONOTONIC, &b), 0);
+ if (a.tv_sec == b.tv_sec) {
+ EXPECT_LT(a.tv_nsec, b.tv_nsec);
+ } else {
+ EXPECT_LT(a.tv_sec, b.tv_sec);
+ }
+}
+
+TEST(LlvmLibcOSUtilVDSOTest, ClockGetRes) {
+ vdso::TypedSymbol<vdso::VDSOSym::ClockGetRes> symbol;
+ if (!symbol)
+ return;
+ timespec res{};
+ EXPECT_EQ(symbol(CLOCK_MONOTONIC, &res), 0);
+ EXPECT_TRUE(res.tv_sec > 0 || res.tv_nsec > 0);
+}
+
+TEST(LlvmLibcOSUtilVDSOTest, GetCpu) {
+ // The kernel system call has a third argument, which should be passed as
+ // nullptr.
+ vdso::TypedSymbol<vdso::VDSOSym::GetCpu> symbol;
+ if (!symbol)
+ return;
+ unsigned cpu = static_cast<unsigned>(-1), node = static_cast<unsigned>(-1);
+ EXPECT_EQ(symbol(&cpu, &node, nullptr), 0);
+ EXPECT_GE(cpu, 0u);
+ EXPECT_GE(node, 0u);
+}
+
+static bool flag = false;
+static void sigprof_handler [[gnu::used]] (int) { flag = true; }
+
+TEST(LlvmLibcOSUtilVDSOTest, RtSigReturn) {
+ using namespace testing::ErrnoSetterMatcher;
+ // Must use struct since there is a function of the same name in the same
+ // scope.
+ struct sigaction sa {};
+ struct sigaction old_sa {};
+ sa.sa_handler = sigprof_handler;
+ sa.sa_flags = SA_RESTORER;
+ vdso::TypedSymbol<vdso::VDSOSym::RTSigReturn> symbol;
+ if (!symbol)
+ return;
+ sa.sa_restorer = symbol;
+ ASSERT_THAT(LIBC_NAMESPACE::sigaction(SIGPROF, &sa, &old_sa), Succeeds());
+ raise(SIGPROF);
+ ASSERT_TRUE(flag);
+ flag = false;
+ ASSERT_THAT(LIBC_NAMESPACE::sigaction(SIGPROF, &old_sa, nullptr), Succeeds());
+}
+
+TEST(LlvmLibcOSUtilVDSOTest, FlushICache) {
+ vdso::TypedSymbol<vdso::VDSOSym::FlushICache> symbol;
+ if (!symbol)
+ return;
+ char buf[512];
+ // We just check that the flush does not crash the program.
+ // As of kernel 6.10, the flags argument only takes 0 or 1, indicating
+ // whether the flush is local to the core or global.
+ symbol(buf, buf + sizeof(buf), 0);
+ symbol(buf, buf + sizeof(buf), 1);
+}
+
+// https://docs.kernel.org/6.5/riscv/hwprobe.html
+TEST(LlvmLibcOSUtilVDSOTest, RiscvHwProbe) {
+ using namespace testing::ErrnoSetterMatcher;
+ vdso::TypedSymbol<vdso::VDSOSym::RiscvHwProbe> symbol;
+ if (!symbol)
+ return;
+ // If a key is unknown to the kernel, its key field is cleared to -1 and its
+ // value is set to 0, so we expect every probe's value to be 0.
+ // Usermode can supply NULL for cpus and 0 for cpu_count as a shortcut for all
+ // online CPUs.
+ riscv_hwprobe probes[2] = {{-1, 1}, {-1, 1}};
+ ASSERT_THAT(symbol(/*pairs=*/probes, /*count=*/2, /*cpusetsize=*/0,
+ /*cpuset=*/nullptr,
+ /*flags=*/0),
+ Succeeds());
+ for (auto &probe : probes) {
+ EXPECT_EQ(probe.key, static_cast<decltype(probe.key)>(-1));
+ EXPECT_EQ(probe.value, static_cast<decltype(probe.value)>(0));
+ }
+}
+
+} // namespace LIBC_NAMESPACE_DECL
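Every test above follows the same pattern: construct a vdso::TypedSymbol,
return early if the kernel does not export that symbol, and otherwise invoke
it like a plain function. Inferred from that usage alone (the real definition
lives in src/__support/OSUtil/linux/vdso.h and is not shown in this hunk), the
interface is roughly:

    namespace vdso {
    template <VDSOSym Sym> struct TypedSymbol {
      explicit operator bool() const; // false if the vDSO lacks the symbol
      template <typename... Ts>
      auto operator()(Ts... args) const; // forward the call into the vDSO
    };
    } // namespace vdso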
diff --git a/libc/test/src/compiler/CMakeLists.txt b/libc/test/src/compiler/CMakeLists.txt
index 65a9acc..a45fa8c 100644
--- a/libc/test/src/compiler/CMakeLists.txt
+++ b/libc/test/src/compiler/CMakeLists.txt
@@ -7,6 +7,7 @@ add_libc_unittest(
SRCS
stack_chk_guard_test.cpp
DEPENDS
+ libc.hdr.signal_macros
libc.src.__support.macros.sanitizer
libc.src.compiler.__stack_chk_fail
libc.src.string.memset
diff --git a/libc/test/src/compiler/stack_chk_guard_test.cpp b/libc/test/src/compiler/stack_chk_guard_test.cpp
index 6b71e15..4ec8398 100644
--- a/libc/test/src/compiler/stack_chk_guard_test.cpp
+++ b/libc/test/src/compiler/stack_chk_guard_test.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-#include "include/llvm-libc-macros/signal-macros.h"
+#include "hdr/signal_macros.h"
#include "src/__support/macros/sanitizer.h"
#include "src/compiler/__stack_chk_fail.h"
#include "src/string/memset.h"
@@ -18,7 +18,7 @@ TEST(LlvmLibcStackChkFail, Death) {
// Disable the test when asan is enabled so that it doesn't immediately fail
// after the memset, but before the stack canary is re-checked.
-#ifndef LIBC_HAVE_ADDRESS_SANITIZER
+#ifndef LIBC_HAS_ADDRESS_SANITIZER
TEST(LlvmLibcStackChkFail, Smash) {
EXPECT_DEATH(
[] {
@@ -27,4 +27,4 @@ TEST(LlvmLibcStackChkFail, Smash) {
},
WITH_SIGNAL(SIGABRT));
}
-#endif // LIBC_HAVE_ADDRESS_SANITIZER
+#endif // LIBC_HAS_ADDRESS_SANITIZER
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 7271e93..e943d98 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -2895,9 +2895,10 @@ add_fp_unittest(
SRCS
nanf_test.cpp
DEPENDS
- libc.include.signal
+ libc.hdr.signal_macros
libc.src.math.nanf
libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.macros.sanitizer
# FIXME: The nan tests currently have death tests, which aren't supported for
# hermetic tests.
UNIT_TEST_ONLY
@@ -2910,9 +2911,10 @@ add_fp_unittest(
SRCS
nan_test.cpp
DEPENDS
- libc.include.signal
+ libc.hdr.signal_macros
libc.src.math.nan
libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.macros.sanitizer
# FIXME: The nan tests currently have death tests, which aren't supported for
# hermetic tests.
UNIT_TEST_ONLY
@@ -2925,9 +2927,10 @@ add_fp_unittest(
SRCS
nanl_test.cpp
DEPENDS
- libc.include.signal
+ libc.hdr.signal_macros
libc.src.math.nanl
libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.macros.sanitizer
# FIXME: The nan tests currently have death tests, which aren't supported for
# hermetic tests.
UNIT_TEST_ONLY
@@ -2940,7 +2943,7 @@ add_fp_unittest(
SRCS
nanf16_test.cpp
DEPENDS
- libc.include.signal
+ libc.hdr.signal_macros
libc.src.math.nanf16
libc.src.__support.FPUtil.fp_bits
libc.src.__support.macros.sanitizer
@@ -2956,9 +2959,10 @@ add_fp_unittest(
SRCS
nanf128_test.cpp
DEPENDS
- libc.include.signal
+ libc.hdr.signal_macros
libc.src.math.nanf128
libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.macros.sanitizer
# FIXME: The nan tests currently have death tests, which aren't supported for
# hermetic tests.
UNIT_TEST_ONLY
diff --git a/libc/test/src/math/smoke/nan_test.cpp b/libc/test/src/math/smoke/nan_test.cpp
index 68c8441..46b9e9a 100644
--- a/libc/test/src/math/smoke/nan_test.cpp
+++ b/libc/test/src/math/smoke/nan_test.cpp
@@ -6,12 +6,13 @@
//
//===----------------------------------------------------------------------===//
+#include "hdr/signal_macros.h"
#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/macros/sanitizer.h"
#include "src/math/nan.h"
#include "test/UnitTest/FEnvSafeTest.h"
#include "test/UnitTest/FPMatcher.h"
#include "test/UnitTest/Test.h"
-#include <signal.h>
class LlvmLibcNanTest : public LIBC_NAMESPACE::testing::FEnvSafeTest {
public:
@@ -43,8 +44,8 @@ TEST_F(LlvmLibcNanTest, RandomString) {
run_test("123 ", 0x7ff8000000000000);
}
-#if !defined(LIBC_HAVE_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX)
+#if !defined(LIBC_HAS_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX)
TEST_F(LlvmLibcNanTest, InvalidInput) {
EXPECT_DEATH([] { LIBC_NAMESPACE::nan(nullptr); }, WITH_SIGNAL(SIGSEGV));
}
-#endif // LIBC_HAVE_ADDRESS_SANITIZER
+#endif // LIBC_HAS_ADDRESS_SANITIZER
diff --git a/libc/test/src/math/smoke/nanf128_test.cpp b/libc/test/src/math/smoke/nanf128_test.cpp
index 015cc31..25dd2ef 100644
--- a/libc/test/src/math/smoke/nanf128_test.cpp
+++ b/libc/test/src/math/smoke/nanf128_test.cpp
@@ -6,7 +6,9 @@
//
//===----------------------------------------------------------------------===//
+#include "hdr/signal_macros.h"
#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/macros/sanitizer.h"
#include "src/__support/uint128.h"
#include "src/math/nanf128.h"
#include "test/UnitTest/FEnvSafeTest.h"
@@ -53,9 +55,8 @@ TEST_F(LlvmLibcNanf128Test, RandomString) {
QUIET_NAN);
}
-#if !defined(LIBC_HAVE_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX)
-#include <signal.h>
+#if !defined(LIBC_HAS_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX)
TEST_F(LlvmLibcNanf128Test, InvalidInput) {
EXPECT_DEATH([] { LIBC_NAMESPACE::nanf128(nullptr); }, WITH_SIGNAL(SIGSEGV));
}
-#endif // LIBC_HAVE_ADDRESS_SANITIZER
+#endif // LIBC_HAS_ADDRESS_SANITIZER
diff --git a/libc/test/src/math/smoke/nanf16_test.cpp b/libc/test/src/math/smoke/nanf16_test.cpp
index 81b844b..ec640a3 100644
--- a/libc/test/src/math/smoke/nanf16_test.cpp
+++ b/libc/test/src/math/smoke/nanf16_test.cpp
@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
+#include "hdr/signal_macros.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/macros/sanitizer.h"
#include "src/math/nanf16.h"
@@ -13,8 +14,6 @@
#include "test/UnitTest/FPMatcher.h"
#include "test/UnitTest/Test.h"
-#include <signal.h>
-
class LlvmLibcNanf16Test : public LIBC_NAMESPACE::testing::FEnvSafeTest {
public:
using StorageType = LIBC_NAMESPACE::fputil::FPBits<float16>::StorageType;
@@ -44,8 +43,8 @@ TEST_F(LlvmLibcNanf16Test, RandomString) {
run_test("123 ", 0x7e00);
}
-#if !defined(LIBC_HAVE_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX)
+#if !defined(LIBC_HAS_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX)
TEST_F(LlvmLibcNanf16Test, InvalidInput) {
EXPECT_DEATH([] { LIBC_NAMESPACE::nanf16(nullptr); }, WITH_SIGNAL(SIGSEGV));
}
-#endif // LIBC_HAVE_ADDRESS_SANITIZER
+#endif // LIBC_HAS_ADDRESS_SANITIZER
diff --git a/libc/test/src/math/smoke/nanf_test.cpp b/libc/test/src/math/smoke/nanf_test.cpp
index ff58236..dd3124e 100644
--- a/libc/test/src/math/smoke/nanf_test.cpp
+++ b/libc/test/src/math/smoke/nanf_test.cpp
@@ -6,12 +6,13 @@
//
//===----------------------------------------------------------------------===//
+#include "hdr/signal_macros.h"
#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/macros/sanitizer.h"
#include "src/math/nanf.h"
#include "test/UnitTest/FEnvSafeTest.h"
#include "test/UnitTest/FPMatcher.h"
#include "test/UnitTest/Test.h"
-#include <signal.h>
class LlvmLibcNanfTest : public LIBC_NAMESPACE::testing::FEnvSafeTest {
public:
@@ -42,8 +43,8 @@ TEST_F(LlvmLibcNanfTest, RandomString) {
run_test("123 ", 0x7fc00000);
}
-#if !defined(LIBC_HAVE_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX)
+#if !defined(LIBC_HAS_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX)
TEST_F(LlvmLibcNanfTest, InvalidInput) {
EXPECT_DEATH([] { LIBC_NAMESPACE::nanf(nullptr); }, WITH_SIGNAL(SIGSEGV));
}
-#endif // LIBC_HAVE_ADDRESS_SANITIZER
+#endif // LIBC_HAS_ADDRESS_SANITIZER
diff --git a/libc/test/src/math/smoke/nanl_test.cpp b/libc/test/src/math/smoke/nanl_test.cpp
index de9af05..ef3f9c1 100644
--- a/libc/test/src/math/smoke/nanl_test.cpp
+++ b/libc/test/src/math/smoke/nanl_test.cpp
@@ -6,12 +6,13 @@
//
//===----------------------------------------------------------------------===//
+#include "hdr/signal_macros.h"
#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/macros/sanitizer.h"
#include "src/math/nanl.h"
#include "test/UnitTest/FEnvSafeTest.h"
#include "test/UnitTest/FPMatcher.h"
#include "test/UnitTest/Test.h"
-#include <signal.h>
#if defined(LIBC_TYPES_LONG_DOUBLE_IS_FLOAT64)
#define SELECT_LONG_DOUBLE(val, _, __) val
@@ -70,8 +71,8 @@ TEST_F(LlvmLibcNanlTest, RandomString) {
run_test("123 ", expected);
}
-#if !defined(LIBC_HAVE_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX)
+#if !defined(LIBC_HAS_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX)
TEST_F(LlvmLibcNanlTest, InvalidInput) {
EXPECT_DEATH([] { LIBC_NAMESPACE::nanl(nullptr); }, WITH_SIGNAL(SIGSEGV));
}
-#endif // LIBC_HAVE_ADDRESS_SANITIZER
+#endif // LIBC_HAS_ADDRESS_SANITIZER
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index a571832..ffff811 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -925,7 +925,6 @@ set(files
exception
execution
expected
- experimental/__config
experimental/__simd/aligned_tag.h
experimental/__simd/declaration.h
experimental/__simd/reference.h
diff --git a/libcxx/include/__config b/libcxx/include/__config
index b0a5dda..f0a9243 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -591,6 +591,15 @@ typedef __char32_t char32_t;
inline namespace _LIBCPP_ABI_NAMESPACE {
# define _LIBCPP_END_NAMESPACE_STD }} _LIBCPP_POP_EXTENSION_DIAGNOSTICS
+#define _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL namespace std { namespace experimental {
+#define _LIBCPP_END_NAMESPACE_EXPERIMENTAL }}
+
+#define _LIBCPP_BEGIN_NAMESPACE_LFTS _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL inline namespace fundamentals_v1 {
+#define _LIBCPP_END_NAMESPACE_LFTS } _LIBCPP_END_NAMESPACE_EXPERIMENTAL
+
+#define _LIBCPP_BEGIN_NAMESPACE_LFTS_V2 _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL inline namespace fundamentals_v2 {
+#define _LIBCPP_END_NAMESPACE_LFTS_V2 } _LIBCPP_END_NAMESPACE_EXPERIMENTAL
+
#ifdef _LIBCPP_ABI_NO_FILESYSTEM_INLINE_NAMESPACE
# define _LIBCPP_BEGIN_NAMESPACE_FILESYSTEM _LIBCPP_BEGIN_NAMESPACE_STD namespace filesystem {
# define _LIBCPP_END_NAMESPACE_FILESYSTEM } _LIBCPP_END_NAMESPACE_STD
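Spelled out, the new macros reproduce the nested namespaces that the deleted
experimental/__config header used to provide. For instance:

    _LIBCPP_BEGIN_NAMESPACE_LFTS   // namespace std { namespace experimental {
                                   //   inline namespace fundamentals_v1 {
    // ... LFTS declarations ...
    _LIBCPP_END_NAMESPACE_LFTS     // } } }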
diff --git a/libcxx/include/__pstl/backend.h b/libcxx/include/__pstl/backend.h
index 86d9f28..5980b07 100644
--- a/libcxx/include/__pstl/backend.h
+++ b/libcxx/include/__pstl/backend.h
@@ -19,16 +19,20 @@
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
-#if defined(_LIBCPP_PSTL_BACKEND_SERIAL)
-# include <__pstl/backends/default.h>
-# include <__pstl/backends/serial.h>
-#elif defined(_LIBCPP_PSTL_BACKEND_STD_THREAD)
-# include <__pstl/backends/default.h>
-# include <__pstl/backends/std_thread.h>
-#elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH)
-# include <__pstl/backends/default.h>
-# include <__pstl/backends/libdispatch.h>
-#endif
+#if _LIBCPP_STD_VER >= 17
+
+# if defined(_LIBCPP_PSTL_BACKEND_SERIAL)
+# include <__pstl/backends/default.h>
+# include <__pstl/backends/serial.h>
+# elif defined(_LIBCPP_PSTL_BACKEND_STD_THREAD)
+# include <__pstl/backends/default.h>
+# include <__pstl/backends/std_thread.h>
+# elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH)
+# include <__pstl/backends/default.h>
+# include <__pstl/backends/libdispatch.h>
+# endif
+
+#endif // _LIBCPP_STD_VER >= 17
_LIBCPP_POP_MACROS
diff --git a/libcxx/include/__pstl/backend_fwd.h b/libcxx/include/__pstl/backend_fwd.h
index 32c5da5..2132e8d 100644
--- a/libcxx/include/__pstl/backend_fwd.h
+++ b/libcxx/include/__pstl/backend_fwd.h
@@ -39,6 +39,8 @@ _LIBCPP_PUSH_MACROS
// the user.
//
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -50,18 +52,18 @@ struct __libdispatch_backend_tag;
struct __serial_backend_tag;
struct __std_thread_backend_tag;
-#if defined(_LIBCPP_PSTL_BACKEND_SERIAL)
+# if defined(_LIBCPP_PSTL_BACKEND_SERIAL)
using __current_configuration = __backend_configuration<__serial_backend_tag, __default_backend_tag>;
-#elif defined(_LIBCPP_PSTL_BACKEND_STD_THREAD)
+# elif defined(_LIBCPP_PSTL_BACKEND_STD_THREAD)
using __current_configuration = __backend_configuration<__std_thread_backend_tag, __default_backend_tag>;
-#elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH)
+# elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH)
using __current_configuration = __backend_configuration<__libdispatch_backend_tag, __default_backend_tag>;
-#else
+# else
// ...New vendors can add parallel backends here...
-# error "Invalid PSTL backend configuration"
-#endif
+# error "Invalid PSTL backend configuration"
+# endif
template <class _Backend, class _ExecutionPolicy>
struct __find_if;
@@ -296,6 +298,8 @@ struct __reduce;
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
_LIBCPP_POP_MACROS
#endif // _LIBCPP___PSTL_BACKEND_FWD_H
diff --git a/libcxx/include/__pstl/backends/default.h b/libcxx/include/__pstl/backends/default.h
index b655da5..3672bbf 100644
--- a/libcxx/include/__pstl/backends/default.h
+++ b/libcxx/include/__pstl/backends/default.h
@@ -33,6 +33,8 @@
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -498,6 +500,8 @@ struct __rotate_copy<__default_backend_tag, _ExecutionPolicy> {
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
_LIBCPP_POP_MACROS
#endif // _LIBCPP___PSTL_BACKENDS_DEFAULT_H
diff --git a/libcxx/include/__pstl/backends/libdispatch.h b/libcxx/include/__pstl/backends/libdispatch.h
index a0c3ad9..2d6ab49 100644
--- a/libcxx/include/__pstl/backends/libdispatch.h
+++ b/libcxx/include/__pstl/backends/libdispatch.h
@@ -44,6 +44,8 @@
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -140,15 +142,15 @@ struct __cpu_traits<__libdispatch_backend_tag> {
unique_ptr<__merge_range_t[], decltype(__destroy)> __ranges(
[&]() -> __merge_range_t* {
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+# ifndef _LIBCPP_HAS_NO_EXCEPTIONS
try {
-#endif
+# endif
return std::allocator<__merge_range_t>().allocate(__n_ranges);
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+# ifndef _LIBCPP_HAS_NO_EXCEPTIONS
} catch (const std::bad_alloc&) {
return nullptr;
}
-#endif
+# endif
}(),
__destroy);
@@ -392,6 +394,8 @@ struct __fill<__libdispatch_backend_tag, _ExecutionPolicy>
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
_LIBCPP_POP_MACROS
#endif // _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H
diff --git a/libcxx/include/__pstl/backends/serial.h b/libcxx/include/__pstl/backends/serial.h
index 5f24499..f414201 100644
--- a/libcxx/include/__pstl/backends/serial.h
+++ b/libcxx/include/__pstl/backends/serial.h
@@ -30,6 +30,8 @@
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -176,6 +178,8 @@ struct __transform_reduce_binary<__serial_backend_tag, _ExecutionPolicy> {
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
_LIBCPP_POP_MACROS
#endif // _LIBCPP___PSTL_BACKENDS_SERIAL_H
diff --git a/libcxx/include/__pstl/backends/std_thread.h b/libcxx/include/__pstl/backends/std_thread.h
index 49570bd..19b985f 100644
--- a/libcxx/include/__pstl/backends/std_thread.h
+++ b/libcxx/include/__pstl/backends/std_thread.h
@@ -32,6 +32,8 @@
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -131,6 +133,8 @@ struct __fill<__std_thread_backend_tag, _ExecutionPolicy>
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
_LIBCPP_POP_MACROS
#endif // _LIBCPP___PSTL_BACKENDS_STD_THREAD_H
diff --git a/libcxx/include/__pstl/cpu_algos/any_of.h b/libcxx/include/__pstl/cpu_algos/any_of.h
index b33c787..803db79 100644
--- a/libcxx/include/__pstl/cpu_algos/any_of.h
+++ b/libcxx/include/__pstl/cpu_algos/any_of.h
@@ -26,6 +26,8 @@
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -94,6 +96,8 @@ struct __cpu_parallel_any_of {
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
_LIBCPP_POP_MACROS
#endif // _LIBCPP___PSTL_CPU_ALGOS_ANY_OF_H
diff --git a/libcxx/include/__pstl/cpu_algos/cpu_traits.h b/libcxx/include/__pstl/cpu_algos/cpu_traits.h
index 0483d691..5e59752 100644
--- a/libcxx/include/__pstl/cpu_algos/cpu_traits.h
+++ b/libcxx/include/__pstl/cpu_algos/cpu_traits.h
@@ -19,6 +19,8 @@
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -81,6 +83,8 @@ struct __cpu_traits;
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
_LIBCPP_POP_MACROS
#endif // _LIBCPP___PSTL_CPU_ALGOS_CPU_TRAITS_H
diff --git a/libcxx/include/__pstl/cpu_algos/fill.h b/libcxx/include/__pstl/cpu_algos/fill.h
index 4e6d29b..3e59365 100644
--- a/libcxx/include/__pstl/cpu_algos/fill.h
+++ b/libcxx/include/__pstl/cpu_algos/fill.h
@@ -23,6 +23,8 @@
# pragma GCC system_header
#endif
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -63,4 +65,6 @@ struct __cpu_parallel_fill {
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
#endif // _LIBCPP___PSTL_CPU_ALGOS_FILL_H
diff --git a/libcxx/include/__pstl/cpu_algos/find_if.h b/libcxx/include/__pstl/cpu_algos/find_if.h
index 12b2e88..cd92e5a 100644
--- a/libcxx/include/__pstl/cpu_algos/find_if.h
+++ b/libcxx/include/__pstl/cpu_algos/find_if.h
@@ -31,6 +31,8 @@
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -132,6 +134,8 @@ struct __cpu_parallel_find_if {
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
_LIBCPP_POP_MACROS
#endif // _LIBCPP___PSTL_CPU_ALGOS_FIND_IF_H
diff --git a/libcxx/include/__pstl/cpu_algos/for_each.h b/libcxx/include/__pstl/cpu_algos/for_each.h
index d4d7862..cec719b 100644
--- a/libcxx/include/__pstl/cpu_algos/for_each.h
+++ b/libcxx/include/__pstl/cpu_algos/for_each.h
@@ -23,6 +23,8 @@
# pragma GCC system_header
#endif
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -63,4 +65,6 @@ struct __cpu_parallel_for_each {
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
#endif // _LIBCPP___PSTL_CPU_ALGOS_FOR_EACH_H
diff --git a/libcxx/include/__pstl/cpu_algos/merge.h b/libcxx/include/__pstl/cpu_algos/merge.h
index dfa4cbf..a9069ca 100644
--- a/libcxx/include/__pstl/cpu_algos/merge.h
+++ b/libcxx/include/__pstl/cpu_algos/merge.h
@@ -26,6 +26,8 @@
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -80,6 +82,8 @@ struct __cpu_parallel_merge {
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
_LIBCPP_POP_MACROS
#endif // _LIBCPP___PSTL_CPU_ALGOS_MERGE_H
diff --git a/libcxx/include/__pstl/cpu_algos/stable_sort.h b/libcxx/include/__pstl/cpu_algos/stable_sort.h
index 8e64f3e..5afdd3f 100644
--- a/libcxx/include/__pstl/cpu_algos/stable_sort.h
+++ b/libcxx/include/__pstl/cpu_algos/stable_sort.h
@@ -21,6 +21,8 @@
# pragma GCC system_header
#endif
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -44,4 +46,6 @@ struct __cpu_parallel_stable_sort {
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
#endif // _LIBCPP___PSTL_CPU_ALGOS_STABLE_SORT_H
diff --git a/libcxx/include/__pstl/cpu_algos/transform.h b/libcxx/include/__pstl/cpu_algos/transform.h
index 27ce8e2..979121b 100644
--- a/libcxx/include/__pstl/cpu_algos/transform.h
+++ b/libcxx/include/__pstl/cpu_algos/transform.h
@@ -27,6 +27,8 @@
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -148,6 +150,8 @@ struct __cpu_parallel_transform_binary {
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
_LIBCPP_POP_MACROS
#endif // _LIBCPP___PSTL_CPU_ALGOS_TRANSFORM_H
diff --git a/libcxx/include/__pstl/cpu_algos/transform_reduce.h b/libcxx/include/__pstl/cpu_algos/transform_reduce.h
index 36ac1a9..aafbf1c 100644
--- a/libcxx/include/__pstl/cpu_algos/transform_reduce.h
+++ b/libcxx/include/__pstl/cpu_algos/transform_reduce.h
@@ -31,6 +31,8 @@
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -211,6 +213,8 @@ struct __cpu_parallel_transform_reduce {
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
_LIBCPP_POP_MACROS
#endif // _LIBCPP___PSTL_CPU_ALGOS_TRANSFORM_REDUCE_H
diff --git a/libcxx/include/__pstl/dispatch.h b/libcxx/include/__pstl/dispatch.h
index 5e903f7..ea40fa7 100644
--- a/libcxx/include/__pstl/dispatch.h
+++ b/libcxx/include/__pstl/dispatch.h
@@ -23,6 +23,8 @@
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -61,6 +63,8 @@ using __dispatch = typename __find_first_implemented<_Algorithm, _BackendConfigu
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
_LIBCPP_POP_MACROS
#endif // _LIBCPP___PSTL_DISPATCH_H
diff --git a/libcxx/include/__pstl/handle_exception.h b/libcxx/include/__pstl/handle_exception.h
index d627095..57dfcfd 100644
--- a/libcxx/include/__pstl/handle_exception.h
+++ b/libcxx/include/__pstl/handle_exception.h
@@ -22,6 +22,8 @@
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
+#if _LIBCPP_STD_VER >= 17
+
_LIBCPP_BEGIN_NAMESPACE_STD
namespace __pstl {
@@ -52,6 +54,8 @@ _LIBCPP_HIDE_FROM_ABI auto __handle_exception(_Args&&... __args) {
} // namespace __pstl
_LIBCPP_END_NAMESPACE_STD
+#endif // _LIBCPP_STD_VER >= 17
+
_LIBCPP_POP_MACROS
#endif // _LIBCPP___PSTL_HANDLE_EXCEPTION_H
diff --git a/libcxx/include/experimental/__config b/libcxx/include/experimental/__config
deleted file mode 100644
index 7b23791..0000000
--- a/libcxx/include/experimental/__config
+++ /dev/null
@@ -1,45 +0,0 @@
-// -*- C++ -*-
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP_EXPERIMENTAL_CONFIG
-#define _LIBCPP_EXPERIMENTAL_CONFIG
-
-#include <__config>
-
-#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-# pragma GCC system_header
-#endif
-
-#define _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL \
- namespace std { \
- namespace experimental {
-#define _LIBCPP_END_NAMESPACE_EXPERIMENTAL \
- } \
- }
-
-#define _LIBCPP_BEGIN_NAMESPACE_LFTS _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL inline namespace fundamentals_v1 {
-#define _LIBCPP_END_NAMESPACE_LFTS \
- } \
- } \
- }
-
-#define _LIBCPP_BEGIN_NAMESPACE_LFTS_V2 _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL inline namespace fundamentals_v2 {
-#define _LIBCPP_END_NAMESPACE_LFTS_V2 \
- } \
- } \
- }
-
-// TODO: support more targets
-#if defined(__AVX__)
-# define _LIBCPP_NATIVE_SIMD_WIDTH_IN_BYTES 32
-#else
-# define _LIBCPP_NATIVE_SIMD_WIDTH_IN_BYTES 16
-#endif
-
-#endif
diff --git a/libcxx/include/experimental/__simd/aligned_tag.h b/libcxx/include/experimental/__simd/aligned_tag.h
index 31d2b50..e364e14 100644
--- a/libcxx/include/experimental/__simd/aligned_tag.h
+++ b/libcxx/include/experimental/__simd/aligned_tag.h
@@ -10,10 +10,10 @@
#ifndef _LIBCPP_EXPERIMENTAL___SIMD_ALIGNED_TAG_H
#define _LIBCPP_EXPERIMENTAL___SIMD_ALIGNED_TAG_H
+#include <__config>
#include <__memory/assume_aligned.h>
#include <__type_traits/remove_const.h>
#include <cstddef>
-#include <experimental/__config>
#include <experimental/__simd/traits.h>
#if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL)
diff --git a/libcxx/include/experimental/__simd/declaration.h b/libcxx/include/experimental/__simd/declaration.h
index 7b45d03..2ac7224 100644
--- a/libcxx/include/experimental/__simd/declaration.h
+++ b/libcxx/include/experimental/__simd/declaration.h
@@ -10,11 +10,18 @@
#ifndef _LIBCPP_EXPERIMENTAL___SIMD_DECLARATION_H
#define _LIBCPP_EXPERIMENTAL___SIMD_DECLARATION_H
+#include <__config>
#include <cstddef>
-#include <experimental/__config>
#if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL)
+// TODO: support more targets
+# if defined(__AVX__)
+# define _LIBCPP_NATIVE_SIMD_WIDTH_IN_BYTES 32
+# else
+# define _LIBCPP_NATIVE_SIMD_WIDTH_IN_BYTES 16
+# endif
+
_LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL
inline namespace parallelism_v2 {
namespace simd_abi {
diff --git a/libcxx/include/experimental/__simd/reference.h b/libcxx/include/experimental/__simd/reference.h
index c60c08b..cba460b 100644
--- a/libcxx/include/experimental/__simd/reference.h
+++ b/libcxx/include/experimental/__simd/reference.h
@@ -10,6 +10,7 @@
#ifndef _LIBCPP_EXPERIMENTAL___SIMD_REFERENCE_H
#define _LIBCPP_EXPERIMENTAL___SIMD_REFERENCE_H
+#include <__config>
#include <__type_traits/enable_if.h>
#include <__type_traits/is_assignable.h>
#include <__type_traits/is_same.h>
@@ -17,7 +18,6 @@
#include <__utility/forward.h>
#include <__utility/move.h>
#include <cstddef>
-#include <experimental/__config>
#include <experimental/__simd/utility.h>
_LIBCPP_PUSH_MACROS
diff --git a/libcxx/include/experimental/__simd/scalar.h b/libcxx/include/experimental/__simd/scalar.h
index a2aeeb5..a76933e 100644
--- a/libcxx/include/experimental/__simd/scalar.h
+++ b/libcxx/include/experimental/__simd/scalar.h
@@ -11,9 +11,9 @@
#define _LIBCPP_EXPERIMENTAL___SIMD_SCALAR_H
#include <__assert>
+#include <__config>
#include <__type_traits/integral_constant.h>
#include <cstddef>
-#include <experimental/__config>
#include <experimental/__simd/declaration.h>
#include <experimental/__simd/traits.h>
diff --git a/libcxx/include/experimental/__simd/simd.h b/libcxx/include/experimental/__simd/simd.h
index db0f9b3..2c65d19 100644
--- a/libcxx/include/experimental/__simd/simd.h
+++ b/libcxx/include/experimental/__simd/simd.h
@@ -10,12 +10,12 @@
#ifndef _LIBCPP_EXPERIMENTAL___SIMD_SIMD_H
#define _LIBCPP_EXPERIMENTAL___SIMD_SIMD_H
+#include <__config>
#include <__type_traits/enable_if.h>
#include <__type_traits/is_same.h>
#include <__type_traits/remove_cvref.h>
#include <__utility/forward.h>
#include <cstddef>
-#include <experimental/__config>
#include <experimental/__simd/declaration.h>
#include <experimental/__simd/reference.h>
#include <experimental/__simd/traits.h>
diff --git a/libcxx/include/experimental/__simd/simd_mask.h b/libcxx/include/experimental/__simd/simd_mask.h
index d54d489..5527319 100644
--- a/libcxx/include/experimental/__simd/simd_mask.h
+++ b/libcxx/include/experimental/__simd/simd_mask.h
@@ -10,10 +10,10 @@
#ifndef _LIBCPP_EXPERIMENTAL___SIMD_SIMD_MASK_H
#define _LIBCPP_EXPERIMENTAL___SIMD_SIMD_MASK_H
+#include <__config>
#include <__type_traits/enable_if.h>
#include <__type_traits/is_same.h>
#include <cstddef>
-#include <experimental/__config>
#include <experimental/__simd/declaration.h>
#include <experimental/__simd/reference.h>
#include <experimental/__simd/traits.h>
diff --git a/libcxx/include/experimental/__simd/traits.h b/libcxx/include/experimental/__simd/traits.h
index ec25b4b..b817df60 100644
--- a/libcxx/include/experimental/__simd/traits.h
+++ b/libcxx/include/experimental/__simd/traits.h
@@ -11,10 +11,10 @@
#define _LIBCPP_EXPERIMENTAL___SIMD_TRAITS_H
#include <__bit/bit_ceil.h>
+#include <__config>
#include <__type_traits/integral_constant.h>
#include <__type_traits/is_same.h>
#include <cstddef>
-#include <experimental/__config>
#include <experimental/__simd/declaration.h>
#include <experimental/__simd/utility.h>
diff --git a/libcxx/include/experimental/__simd/utility.h b/libcxx/include/experimental/__simd/utility.h
index 708fa3d..0103b06 100644
--- a/libcxx/include/experimental/__simd/utility.h
+++ b/libcxx/include/experimental/__simd/utility.h
@@ -10,6 +10,7 @@
#ifndef _LIBCPP_EXPERIMENTAL___SIMD_UTILITY_H
#define _LIBCPP_EXPERIMENTAL___SIMD_UTILITY_H
+#include <__config>
#include <__type_traits/is_arithmetic.h>
#include <__type_traits/is_const.h>
#include <__type_traits/is_constant_evaluated.h>
@@ -22,7 +23,6 @@
#include <__utility/integer_sequence.h>
#include <cstddef>
#include <cstdint>
-#include <experimental/__config>
#include <limits>
_LIBCPP_PUSH_MACROS
diff --git a/libcxx/include/experimental/__simd/vec_ext.h b/libcxx/include/experimental/__simd/vec_ext.h
index 5787f23..1f707cf 100644
--- a/libcxx/include/experimental/__simd/vec_ext.h
+++ b/libcxx/include/experimental/__simd/vec_ext.h
@@ -12,11 +12,11 @@
#include <__assert>
#include <__bit/bit_ceil.h>
+#include <__config>
#include <__type_traits/integral_constant.h>
#include <__utility/forward.h>
#include <__utility/integer_sequence.h>
#include <cstddef>
-#include <experimental/__config>
#include <experimental/__simd/declaration.h>
#include <experimental/__simd/traits.h>
#include <experimental/__simd/utility.h>
diff --git a/libcxx/include/experimental/iterator b/libcxx/include/experimental/iterator
index de82da2..edfe6e7 100644
--- a/libcxx/include/experimental/iterator
+++ b/libcxx/include/experimental/iterator
@@ -52,11 +52,11 @@ namespace std {
*/
+#include <__config>
#include <__memory/addressof.h>
#include <__type_traits/decay.h>
#include <__utility/forward.h>
#include <__utility/move.h>
-#include <experimental/__config>
#include <iterator>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/experimental/memory b/libcxx/include/experimental/memory
index e9663d4..bf8a154 100644
--- a/libcxx/include/experimental/memory
+++ b/libcxx/include/experimental/memory
@@ -49,6 +49,7 @@ public:
}
*/
+#include <__config>
#include <__functional/hash.h>
#include <__functional/operations.h>
#include <__type_traits/add_lvalue_reference.h>
@@ -57,7 +58,6 @@ public:
#include <__type_traits/enable_if.h>
#include <__type_traits/is_convertible.h>
#include <cstddef>
-#include <experimental/__config>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
diff --git a/libcxx/include/experimental/propagate_const b/libcxx/include/experimental/propagate_const
index d7a695d..510d374 100644
--- a/libcxx/include/experimental/propagate_const
+++ b/libcxx/include/experimental/propagate_const
@@ -107,6 +107,7 @@
*/
+#include <__config>
#include <__functional/operations.h>
#include <__fwd/functional.h>
#include <__type_traits/conditional.h>
@@ -128,7 +129,6 @@
#include <__utility/move.h>
#include <__utility/swap.h>
#include <cstddef>
-#include <experimental/__config>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
diff --git a/libcxx/include/experimental/simd b/libcxx/include/experimental/simd
index 484543b..35120b4 100644
--- a/libcxx/include/experimental/simd
+++ b/libcxx/include/experimental/simd
@@ -75,7 +75,7 @@ inline namespace parallelism_v2 {
# pragma GCC system_header
#endif
-#include <experimental/__config>
+#include <__config>
#include <experimental/__simd/aligned_tag.h>
#include <experimental/__simd/declaration.h>
#include <experimental/__simd/reference.h>
diff --git a/libcxx/include/experimental/type_traits b/libcxx/include/experimental/type_traits
index 31b041b..a4bb59a 100644
--- a/libcxx/include/experimental/type_traits
+++ b/libcxx/include/experimental/type_traits
@@ -68,7 +68,7 @@ inline namespace fundamentals_v1 {
*/
-#include <experimental/__config>
+#include <__config>
#if _LIBCPP_STD_VER >= 14
diff --git a/libcxx/include/experimental/utility b/libcxx/include/experimental/utility
index 8bd0a05..cbc7ad1 100644
--- a/libcxx/include/experimental/utility
+++ b/libcxx/include/experimental/utility
@@ -30,7 +30,7 @@ inline namespace fundamentals_v1 {
*/
-#include <experimental/__config>
+#include <__config>
#include <utility>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index 65df579..add8726d 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -610,10 +610,6 @@ module std_experimental [system] {
header "experimental/utility"
export *
}
- module __config {
- textual header "experimental/__config"
- export *
- }
}
// Convenience method to get all of the above modules in a single import statement.
diff --git a/libcxx/src/any.cpp b/libcxx/src/any.cpp
index b0ad695..eaca2dd 100644
--- a/libcxx/src/any.cpp
+++ b/libcxx/src/any.cpp
@@ -12,7 +12,7 @@ namespace std {
const char* bad_any_cast::what() const noexcept { return "bad any cast"; }
} // namespace std
-#include <experimental/__config>
+#include <__config>
// Preserve std::experimental::any_bad_cast for ABI compatibility
// Even though it no longer exists in a header file
diff --git a/libcxx/src/optional.cpp b/libcxx/src/optional.cpp
index 62b474a..4e7e288 100644
--- a/libcxx/src/optional.cpp
+++ b/libcxx/src/optional.cpp
@@ -17,7 +17,7 @@ const char* bad_optional_access::what() const noexcept { return "bad_optional_ac
} // namespace std
-#include <experimental/__config>
+#include <__config>
// Preserve std::experimental::bad_optional_access for ABI compatibility
// Even though it no longer exists in a header file
diff --git a/lld/test/wasm/static-error.s b/lld/test/wasm/static-error.s
new file mode 100644
index 0000000..3557506
--- /dev/null
+++ b/lld/test/wasm/static-error.s
@@ -0,0 +1,12 @@
+// RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %s -o %t.o
+// RUN: wasm-ld --experimental-pic -shared -o %t.so %t.o
+
+// RUN: wasm-ld --experimental-pic -pie -o /dev/null %t.o %t.so
+// RUN: not wasm-ld -o /dev/null -static %t.o %t.so 2>&1 | FileCheck %s
+
+// CHECK: attempted static link of dynamic object
+
+.global _start
+_start:
+ .functype _start () -> ()
+ end_function
diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp
index cb8fe25..2de7dca 100644
--- a/lld/wasm/Driver.cpp
+++ b/lld/wasm/Driver.cpp
@@ -333,9 +333,15 @@ void LinkerDriver::addFile(StringRef path) {
return;
}
case file_magic::bitcode:
- case file_magic::wasm_object:
- files.push_back(createObjectFile(mbref, "", 0, inLib));
+ case file_magic::wasm_object: {
+ auto obj = createObjectFile(mbref, "", 0, inLib);
+ if (config->isStatic && isa<SharedFile>(obj)) {
+ error("attempted static link of dynamic object " + path);
+ break;
+ }
+ files.push_back(obj);
break;
+ }
case file_magic::unknown:
if (mbref.getBuffer().starts_with("#STUB")) {
files.push_back(make<StubFile>(mbref));
diff --git a/lldb/include/lldb/API/SBMemoryRegionInfo.h b/lldb/include/lldb/API/SBMemoryRegionInfo.h
index be55de4..f9a5dc9 100644
--- a/lldb/include/lldb/API/SBMemoryRegionInfo.h
+++ b/lldb/include/lldb/API/SBMemoryRegionInfo.h
@@ -120,7 +120,7 @@ public:
private:
friend class SBProcess;
friend class SBMemoryRegionInfoList;
-
+ friend class SBSaveCoreOptions;
friend class lldb_private::ScriptInterpreter;
lldb_private::MemoryRegionInfo &ref();
diff --git a/lldb/include/lldb/API/SBSaveCoreOptions.h b/lldb/include/lldb/API/SBSaveCoreOptions.h
index ba48ba5..c076d3c 100644
--- a/lldb/include/lldb/API/SBSaveCoreOptions.h
+++ b/lldb/include/lldb/API/SBSaveCoreOptions.h
@@ -80,6 +80,17 @@ public:
/// \return True if the thread was removed, false if it was not in the list.
bool RemoveThread(lldb::SBThread thread);
+ /// Add a memory region to save in the core file.
+ ///
+ /// \param region The memory region to save.
+ /// \returns An empty SBError upon success, or an error if the region is
+ /// invalid.
+ /// \note Ranges that overlap will be unioned into a single region; this
+ /// also supersedes stack minification. Specifying full regions and a
+ /// non-custom core style will include the specified regions and union them
+ /// with all style-specific regions.
+ SBError AddMemoryRegionToSave(const SBMemoryRegionInfo &region);
+
/// Reset all options.
void Clear();
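A hypothetical call sequence for the new API (the region lookup through
SBProcess is illustrative context, not part of this patch):

    lldb::SBMemoryRegionInfo region;
    process.GetMemoryRegionInfo(interesting_addr, region); // populate region
    lldb::SBSaveCoreOptions options;
    lldb::SBError error = options.AddMemoryRegionToSave(region);
    // Combined with the new eSaveCoreCustomOnly style, only regions added
    // this way end up in the core file.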
diff --git a/lldb/include/lldb/Core/SourceManager.h b/lldb/include/lldb/Core/SourceManager.h
index 172824d..e386271 100644
--- a/lldb/include/lldb/Core/SourceManager.h
+++ b/lldb/include/lldb/Core/SourceManager.h
@@ -74,6 +74,10 @@ public:
const Checksum &GetChecksum() const { return m_checksum; }
+ llvm::once_flag &GetChecksumWarningOnceFlag() {
+ return m_checksum_warning_once_flag;
+ }
+
protected:
/// Set file and update modification time.
void SetSupportFile(lldb::SupportFileSP support_file_sp);
@@ -87,6 +91,9 @@ public:
/// Keep track of the on-disk checksum.
Checksum m_checksum;
+ /// Once flag for emitting a checksum mismatch warning.
+ llvm::once_flag m_checksum_warning_once_flag;
+
// Keep the modification time that this file data is valid for
llvm::sys::TimePoint<> m_mod_time;
diff --git a/lldb/include/lldb/Symbol/SaveCoreOptions.h b/lldb/include/lldb/Symbol/SaveCoreOptions.h
index f4fed46..d90d080 100644
--- a/lldb/include/lldb/Symbol/SaveCoreOptions.h
+++ b/lldb/include/lldb/Symbol/SaveCoreOptions.h
@@ -10,13 +10,15 @@
#define LLDB_SOURCE_PLUGINS_OBJECTFILE_SaveCoreOPTIONS_H
#include "lldb/Utility/FileSpec.h"
-#include "lldb/lldb-forward.h"
-#include "lldb/lldb-types.h"
+#include "lldb/Utility/RangeMap.h"
#include <optional>
+#include <set>
#include <string>
#include <unordered_set>
+using MemoryRanges = lldb_private::RangeVector<lldb::addr_t, lldb::addr_t>;
+
namespace lldb_private {
class SaveCoreOptions {
@@ -38,8 +40,12 @@ public:
Status AddThread(lldb::ThreadSP thread_sp);
bool RemoveThread(lldb::ThreadSP thread_sp);
bool ShouldThreadBeSaved(lldb::tid_t tid) const;
+ bool HasSpecifiedThreads() const;
Status EnsureValidConfiguration(lldb::ProcessSP process_sp) const;
+ const MemoryRanges &GetCoreFileMemoryRanges() const;
+
+ void AddMemoryRegionToSave(const lldb_private::MemoryRegionInfo &region);
void Clear();
@@ -51,6 +57,7 @@ private:
std::optional<lldb::SaveCoreStyle> m_style;
lldb::ProcessSP m_process_sp;
std::unordered_set<lldb::tid_t> m_threads_to_save;
+ MemoryRanges m_regions_to_save;
};
} // namespace lldb_private
diff --git a/lldb/include/lldb/Target/CoreFileMemoryRanges.h b/lldb/include/lldb/Target/CoreFileMemoryRanges.h
new file mode 100644
index 0000000..503ecd6
--- /dev/null
+++ b/lldb/include/lldb/Target/CoreFileMemoryRanges.h
@@ -0,0 +1,50 @@
+//===-- CoreFileMemoryRanges.h ----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/Utility/RangeMap.h"
+#include "lldb/Utility/Status.h"
+
+#include "llvm/ADT/AddressRanges.h"
+
+#ifndef LLDB_TARGET_COREFILEMEMORYRANGES_H
+#define LLDB_TARGET_COREFILEMEMORYRANGES_H
+
+namespace lldb_private {
+
+struct CoreFileMemoryRange {
+ llvm::AddressRange range; /// The address range to save into the core file.
+ uint32_t lldb_permissions; /// A bit set of lldb::Permissions bits.
+
+ bool operator==(const CoreFileMemoryRange &rhs) const {
+ return range == rhs.range && lldb_permissions == rhs.lldb_permissions;
+ }
+
+ bool operator!=(const CoreFileMemoryRange &rhs) const {
+ return !(*this == rhs);
+ }
+
+ bool operator<(const CoreFileMemoryRange &rhs) const {
+ if (range < rhs.range)
+ return true;
+ if (range == rhs.range)
+ return lldb_permissions < rhs.lldb_permissions;
+ return false;
+ }
+};
+
+class CoreFileMemoryRanges
+ : public lldb_private::RangeDataVector<lldb::addr_t, lldb::addr_t,
+ CoreFileMemoryRange> {
+public:
+ /// Finalize and merge all overlapping ranges in this collection. Ranges
+ /// will be separated based on permissions.
+ Status FinalizeCoreFileSaveRanges();
+};
+} // namespace lldb_private
+
+#endif // LLDB_TARGET_COREFILEMEMORYRANGES_H
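Putting the new pieces together, a plausible usage sketch (addresses and
permissions are made-up values; the three-argument Append overload is the one
added to RangeDataVector later in this patch):

    lldb_private::CoreFileMemoryRanges ranges;
    ranges.Append(0x1000, 0x1000, // base and size keys for the range data
                  lldb_private::CoreFileMemoryRange{
                      llvm::AddressRange(0x1000, 0x2000),
                      lldb::ePermissionsReadable});
    lldb_private::Status error = ranges.FinalizeCoreFileSaveRanges();
    if (error.Fail()) // the merge of overlapping ranges failed
      return error;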
diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h
index c66cfb2..b8c53a4 100644
--- a/lldb/include/lldb/Target/Process.h
+++ b/lldb/include/lldb/Target/Process.h
@@ -35,6 +35,8 @@
#include "lldb/Host/ProcessLaunchInfo.h"
#include "lldb/Host/ProcessRunLock.h"
#include "lldb/Symbol/ObjectFile.h"
+#include "lldb/Symbol/SaveCoreOptions.h"
+#include "lldb/Target/CoreFileMemoryRanges.h"
#include "lldb/Target/ExecutionContextScope.h"
#include "lldb/Target/InstrumentationRuntime.h"
#include "lldb/Target/Memory.h"
@@ -710,29 +712,6 @@ public:
/// is not supported by the plugin, error otherwise.
virtual llvm::Expected<bool> SaveCore(llvm::StringRef outfile);
- struct CoreFileMemoryRange {
- llvm::AddressRange range; /// The address range to save into the core file.
- uint32_t lldb_permissions; /// A bit set of lldb::Permissions bits.
-
- bool operator==(const CoreFileMemoryRange &rhs) const {
- return range == rhs.range && lldb_permissions == rhs.lldb_permissions;
- }
-
- bool operator!=(const CoreFileMemoryRange &rhs) const {
- return !(*this == rhs);
- }
-
- bool operator<(const CoreFileMemoryRange &rhs) const {
- if (range < rhs.range)
- return true;
- if (range == rhs.range)
- return lldb_permissions < rhs.lldb_permissions;
- return false;
- }
- };
-
- using CoreFileMemoryRanges = std::vector<CoreFileMemoryRange>;
-
/// Helper function for Process::SaveCore(...) that calculates the address
/// ranges that should be saved. This allows all core file plug-ins to save
/// consistent memory ranges given a \a core_style.
diff --git a/lldb/include/lldb/Utility/RangeMap.h b/lldb/include/lldb/Utility/RangeMap.h
index 8cc382b..433466e 100644
--- a/lldb/include/lldb/Utility/RangeMap.h
+++ b/lldb/include/lldb/Utility/RangeMap.h
@@ -450,6 +450,12 @@ public:
void Append(const Entry &entry) { m_entries.emplace_back(entry); }
+ /// Append a range with data to the vector.
+ /// \param b The base of the memory range
+ /// \param s The size of the memory range
+ /// \param t The data associated with the memory range
+ void Append(B &&b, S &&s, T &&t) { m_entries.emplace_back(Entry(b, s, t)); }
+
bool Erase(uint32_t start, uint32_t end) {
if (start >= end || end > m_entries.size())
return false;
diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h
index 7bfde8b..938f6e3 100644
--- a/lldb/include/lldb/lldb-enumerations.h
+++ b/lldb/include/lldb/lldb-enumerations.h
@@ -1222,6 +1222,7 @@ enum SaveCoreStyle {
eSaveCoreFull = 1,
eSaveCoreDirtyOnly = 2,
eSaveCoreStackOnly = 3,
+ eSaveCoreCustomOnly = 4,
};
/// Events that might happen during a trace session.
diff --git a/lldb/include/lldb/lldb-forward.h b/lldb/include/lldb/lldb-forward.h
index 337eff6..5fb288a 100644
--- a/lldb/include/lldb/lldb-forward.h
+++ b/lldb/include/lldb/lldb-forward.h
@@ -207,6 +207,7 @@ class StackFrameRecognizer;
class StackFrameRecognizerManager;
class StackID;
class Status;
+class SaveCoreOptions;
class StopInfo;
class Stoppoint;
class StoppointCallbackContext;
diff --git a/lldb/include/lldb/lldb-private-interfaces.h b/lldb/include/lldb/lldb-private-interfaces.h
index b3c8cda..5bac5cd 100644
--- a/lldb/include/lldb/lldb-private-interfaces.h
+++ b/lldb/include/lldb/lldb-private-interfaces.h
@@ -9,7 +9,6 @@
#ifndef LLDB_LLDB_PRIVATE_INTERFACES_H
#define LLDB_LLDB_PRIVATE_INTERFACES_H
-#include "lldb/Symbol/SaveCoreOptions.h"
#include "lldb/lldb-enumerations.h"
#include "lldb/lldb-forward.h"
#include "lldb/lldb-private-enumerations.h"
diff --git a/lldb/source/API/SBSaveCoreOptions.cpp b/lldb/source/API/SBSaveCoreOptions.cpp
index ef82b02..c79b57f 100644
--- a/lldb/source/API/SBSaveCoreOptions.cpp
+++ b/lldb/source/API/SBSaveCoreOptions.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "lldb/API/SBSaveCoreOptions.h"
+#include "lldb/API/SBMemoryRegionInfo.h"
#include "lldb/Host/FileSystem.h"
#include "lldb/Symbol/SaveCoreOptions.h"
#include "lldb/Utility/Instrumentation.h"
@@ -89,6 +90,16 @@ bool SBSaveCoreOptions::RemoveThread(lldb::SBThread thread) {
return m_opaque_up->RemoveThread(thread.GetSP());
}
+lldb::SBError
+SBSaveCoreOptions::AddMemoryRegionToSave(const SBMemoryRegionInfo &region) {
+ LLDB_INSTRUMENT_VA(this, region);
+ // Currently, adding a memory region can't fail, so we always return a
+ // success SBError; but because these APIs live forever, returning an error
+ // is the most future-proof thing to do.
+ m_opaque_up->AddMemoryRegionToSave(region.ref());
+ return SBError();
+}
+
void SBSaveCoreOptions::Clear() {
LLDB_INSTRUMENT_VA(this);
m_opaque_up->Clear();
diff --git a/lldb/source/Commands/CommandObjectProcess.cpp b/lldb/source/Commands/CommandObjectProcess.cpp
index 25eb633..5b0f4f6 100644
--- a/lldb/source/Commands/CommandObjectProcess.cpp
+++ b/lldb/source/Commands/CommandObjectProcess.cpp
@@ -25,6 +25,7 @@
#include "lldb/Interpreter/OptionArgParser.h"
#include "lldb/Interpreter/OptionGroupPythonClassWithDict.h"
#include "lldb/Interpreter/Options.h"
+#include "lldb/Symbol/SaveCoreOptions.h"
#include "lldb/Target/Platform.h"
#include "lldb/Target/Process.h"
#include "lldb/Target/StopInfo.h"
diff --git a/lldb/source/Core/SourceManager.cpp b/lldb/source/Core/SourceManager.cpp
index f97d86a..fd5b499 100644
--- a/lldb/source/Core/SourceManager.cpp
+++ b/lldb/source/Core/SourceManager.cpp
@@ -61,6 +61,12 @@ static void resolve_tilde(FileSpec &file_spec) {
}
}
+static std::string toString(const Checksum &checksum) {
+ if (!checksum)
+ return "";
+ return std::string(llvm::formatv("{0}", checksum.digest()));
+}
+
// SourceManager constructor
SourceManager::SourceManager(const TargetSP &target_sp)
: m_last_support_file_sp(std::make_shared<SupportFile>()), m_last_line(0),
@@ -302,6 +308,18 @@ size_t SourceManager::DisplaySourceLinesWithLineNumbersUsingLastFile(
break;
}
}
+
+ Checksum line_table_checksum =
+ last_file_sp->GetSupportFile()->GetChecksum();
+ Checksum on_disk_checksum = last_file_sp->GetChecksum();
+ if (line_table_checksum && line_table_checksum != on_disk_checksum)
+ Debugger::ReportWarning(
+ llvm::formatv(
+ "{0}: source file checksum mismatch between line table "
+ "({1}) and file on disk ({2})",
+ last_file_sp->GetSupportFile()->GetSpecOnly().GetFilename(),
+ toString(line_table_checksum), toString(on_disk_checksum)),
+ std::nullopt, &last_file_sp->GetChecksumWarningOnceFlag());
}
return *delta;
}
@@ -837,12 +855,6 @@ SourceManager::FileSP SourceManager::SourceFileCache::FindSourceFile(
return {};
}
-static std::string toString(const Checksum &checksum) {
- if (!checksum)
- return "";
- return std::string(llvm::formatv("{0}", checksum.digest()));
-}
-
void SourceManager::SourceFileCache::Dump(Stream &stream) const {
// clang-format off
stream << "Modification time MD5 Checksum (on-disk) MD5 Checksum (line table) Lines Path\n";
diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
index b28beab..06da83e 100644
--- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
+++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
@@ -6562,13 +6562,15 @@ bool ObjectFileMachO::SaveCore(const lldb::ProcessSP &process_sp,
}
if (make_core) {
- Process::CoreFileMemoryRanges core_ranges;
+ CoreFileMemoryRanges core_ranges;
error = process_sp->CalculateCoreFileSaveRanges(options, core_ranges);
if (error.Success()) {
const uint32_t addr_byte_size = target_arch.GetAddressByteSize();
const ByteOrder byte_order = target_arch.GetByteOrder();
std::vector<llvm::MachO::segment_command_64> segment_load_commands;
- for (const auto &core_range : core_ranges) {
+ for (const auto &core_range_info : core_ranges) {
+ // TODO: Refactor RangeDataVector to have a data iterator.
+ const auto &core_range = core_range_info.data;
uint32_t cmd_type = LC_SEGMENT_64;
uint32_t segment_size = sizeof(llvm::MachO::segment_command_64);
if (addr_byte_size == 4) {
diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h
index 27bc237..be87112 100644
--- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h
+++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h
@@ -12,6 +12,7 @@
#include "lldb/Core/Address.h"
#include "lldb/Host/SafeMachO.h"
#include "lldb/Symbol/ObjectFile.h"
+#include "lldb/Symbol/SaveCoreOptions.h"
#include "lldb/Utility/FileSpec.h"
#include "lldb/Utility/FileSpecList.h"
#include "lldb/Utility/RangeMap.h"
diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp
index 5c9ba22..edc568a 100644
--- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp
+++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp
@@ -831,25 +831,32 @@ Status MinidumpFileBuilder::AddMemoryList() {
// bytes of the core file. Thread structures in minidump files can only use
// 32 bit memory descriptors, so we emit them first to ensure the memory is
// accessible with a 32 bit offset.
- Process::CoreFileMemoryRanges ranges_32;
- Process::CoreFileMemoryRanges ranges_64;
- Process::CoreFileMemoryRanges all_core_memory_ranges;
+ std::vector<CoreFileMemoryRange> ranges_32;
+ std::vector<CoreFileMemoryRange> ranges_64;
+ CoreFileMemoryRanges all_core_memory_ranges;
error = m_process_sp->CalculateCoreFileSaveRanges(m_save_core_options,
all_core_memory_ranges);
+
+ std::vector<CoreFileMemoryRange> all_core_memory_vec;
+ // Extract all the data into a plain vector so that we can mutate it in
+ // place.
+ for (const auto &core_range : all_core_memory_ranges)
+ all_core_memory_vec.push_back(core_range.data);
+
if (error.Fail())
return error;
// Start by saving all of the stacks and ensuring they fit under the 32b
// limit.
uint64_t total_size = GetCurrentDataEndOffset();
- auto iterator = all_core_memory_ranges.begin();
- while (iterator != all_core_memory_ranges.end()) {
+ auto iterator = all_core_memory_vec.begin();
+ while (iterator != all_core_memory_vec.end()) {
if (m_saved_stack_ranges.count(iterator->range.start()) > 0) {
// We don't save stacks twice.
ranges_32.push_back(*iterator);
total_size +=
iterator->range.size() + sizeof(llvm::minidump::MemoryDescriptor);
- iterator = all_core_memory_ranges.erase(iterator);
+ iterator = all_core_memory_vec.erase(iterator);
} else {
iterator++;
}
@@ -869,11 +876,11 @@ Status MinidumpFileBuilder::AddMemoryList() {
// Then anything that overflows extends into 64b addressable space.
// All core memory ranges will contain either nothing, the stacks only,
// or all the memory ranges including the stacks.
- if (!all_core_memory_ranges.empty())
- total_size += 256 + (all_core_memory_ranges.size() *
+ if (!all_core_memory_vec.empty())
+ total_size += 256 + (all_core_memory_vec.size() *
sizeof(llvm::minidump::MemoryDescriptor_64));
- for (const auto &core_range : all_core_memory_ranges) {
+ for (const auto &core_range : all_core_memory_vec) {
const addr_t range_size = core_range.range.size();
// We don't need to check for stacks here because we already removed them
// from all_core_memory_vec.
@@ -958,15 +965,15 @@ Status MinidumpFileBuilder::DumpDirectories() const {
}
static uint64_t
-GetLargestRangeSize(const Process::CoreFileMemoryRanges &ranges) {
+GetLargestRangeSize(const std::vector<CoreFileMemoryRange> &ranges) {
uint64_t max_size = 0;
for (const auto &core_range : ranges)
max_size = std::max(max_size, core_range.range.size());
return max_size;
}
-Status
-MinidumpFileBuilder::AddMemoryList_32(Process::CoreFileMemoryRanges &ranges) {
+Status MinidumpFileBuilder::AddMemoryList_32(
+ std::vector<CoreFileMemoryRange> &ranges) {
std::vector<MemoryDescriptor> descriptors;
Status error;
if (ranges.size() == 0)
@@ -1042,8 +1049,8 @@ MinidumpFileBuilder::AddMemoryList_32(Process::CoreFileMemoryRanges &ranges) {
return error;
}
-Status
-MinidumpFileBuilder::AddMemoryList_64(Process::CoreFileMemoryRanges &ranges) {
+Status MinidumpFileBuilder::AddMemoryList_64(
+ std::vector<CoreFileMemoryRange> &ranges) {
Status error;
if (ranges.empty())
return error;
diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h
index 762de83..71001e2 100644
--- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h
+++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h
@@ -23,6 +23,7 @@
#include <utility>
#include <variant>
+#include "lldb/Symbol/SaveCoreOptions.h"
#include "lldb/Target/Process.h"
#include "lldb/Target/Target.h"
#include "lldb/Utility/DataBufferHeap.h"
@@ -120,9 +121,9 @@ private:
lldb_private::Status AddData(const void *data, uint64_t size);
// Add MemoryList stream, containing dumps of important memory segments
lldb_private::Status
- AddMemoryList_64(lldb_private::Process::CoreFileMemoryRanges &ranges);
+ AddMemoryList_64(std::vector<lldb_private::CoreFileMemoryRange> &ranges);
lldb_private::Status
- AddMemoryList_32(lldb_private::Process::CoreFileMemoryRanges &ranges);
+ AddMemoryList_32(std::vector<lldb_private::CoreFileMemoryRange> &ranges);
// Update the thread list on disk with the newly emitted stack RVAs.
lldb_private::Status FixThreadStacks();
lldb_private::Status FlushBufferToDisk();
diff --git a/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.h b/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.h
index b76fcd0..2f45f01 100644
--- a/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.h
+++ b/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.h
@@ -21,6 +21,7 @@
#define LLDB_SOURCE_PLUGINS_OBJECTFILE_MINIDUMP_OBJECTFILEMINIDUMP_H
#include "lldb/Symbol/ObjectFile.h"
+#include "lldb/Symbol/SaveCoreOptions.h"
#include "lldb/Utility/ArchSpec.h"
class ObjectFileMinidump : public lldb_private::PluginInterface {
diff --git a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp
index 9d01089..8d9c919 100644
--- a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp
+++ b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp
@@ -17,6 +17,7 @@
#include "lldb/Interpreter/OptionValueDictionary.h"
#include "lldb/Interpreter/OptionValueProperties.h"
#include "lldb/Symbol/ObjectFile.h"
+#include "lldb/Symbol/SaveCoreOptions.h"
#include "lldb/Target/Process.h"
#include "lldb/Target/SectionLoadList.h"
#include "lldb/Target/Target.h"
diff --git a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h
index 8bccf3b..4f4dedf 100644
--- a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h
+++ b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h
@@ -13,6 +13,7 @@
#include <vector>
#include "lldb/Symbol/ObjectFile.h"
+#include "lldb/Symbol/SaveCoreOptions.h"
#include "llvm/Object/COFF.h"
class ObjectFilePECOFF : public lldb_private::ObjectFile {
diff --git a/lldb/source/Symbol/SaveCoreOptions.cpp b/lldb/source/Symbol/SaveCoreOptions.cpp
index 35943726..8d9aade 100644
--- a/lldb/source/Symbol/SaveCoreOptions.cpp
+++ b/lldb/source/Symbol/SaveCoreOptions.cpp
@@ -102,6 +102,19 @@ bool SaveCoreOptions::ShouldThreadBeSaved(lldb::tid_t tid) const {
return m_threads_to_save.count(tid) > 0;
}
+bool SaveCoreOptions::HasSpecifiedThreads() const {
+ return !m_threads_to_save.empty();
+}
+
+void SaveCoreOptions::AddMemoryRegionToSave(
+ const lldb_private::MemoryRegionInfo &region) {
+ m_regions_to_save.Insert(region.GetRange(), /*combine=*/true);
+}
+
+const MemoryRanges &SaveCoreOptions::GetCoreFileMemoryRanges() const {
+ return m_regions_to_save;
+}
+
Status SaveCoreOptions::EnsureValidConfiguration(
lldb::ProcessSP process_sp) const {
Status error;
@@ -131,4 +144,5 @@ void SaveCoreOptions::Clear() {
m_style = std::nullopt;
m_threads_to_save.clear();
m_process_sp.reset();
+ m_regions_to_save.Clear();
}
diff --git a/lldb/source/Target/CMakeLists.txt b/lldb/source/Target/CMakeLists.txt
index a42c44b..a6d2eac 100644
--- a/lldb/source/Target/CMakeLists.txt
+++ b/lldb/source/Target/CMakeLists.txt
@@ -11,6 +11,7 @@ add_lldb_library(lldbTarget
ABI.cpp
AssertFrameRecognizer.cpp
DynamicRegisterInfo.cpp
+ CoreFileMemoryRanges.cpp
ExecutionContext.cpp
InstrumentationRuntime.cpp
InstrumentationRuntimeStopInfo.cpp
diff --git a/lldb/source/Target/CoreFileMemoryRanges.cpp b/lldb/source/Target/CoreFileMemoryRanges.cpp
new file mode 100644
index 0000000..6e4ca49
--- /dev/null
+++ b/lldb/source/Target/CoreFileMemoryRanges.cpp
@@ -0,0 +1,86 @@
+//===-- CoreFileMemoryRanges.cpp --------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/Target/CoreFileMemoryRanges.h"
+
+using namespace lldb;
+using namespace lldb_private;
+
+using Entry = CoreFileMemoryRanges::Entry;
+
+static bool Overlaps(const Entry *region_one, const Entry *region_two) {
+ return !(region_one->GetRangeEnd() < region_two->GetRangeBase() ||
+ region_two->GetRangeEnd() < region_one->GetRangeBase());
+}
+
+static bool IntersectHelper(const Entry *region_one, const Entry *region_two) {
+ return region_one->GetRangeBase() == region_two->GetRangeEnd() ||
+ region_one->GetRangeEnd() == region_two->GetRangeBase();
+}
+
+static bool OnlyIntersects(const Entry *region_one, const Entry *region_two) {
+ return IntersectHelper(region_one, region_two) ||
+ IntersectHelper(region_two, region_one);
+}
+
+static bool PermissionsMatch(const Entry *region_one, const Entry *region_two) {
+ return region_one->data.lldb_permissions == region_two->data.lldb_permissions;
+}
+
+// This assumes any overlapping ranges will share the same permissions
+// and that adjacent ranges could have different permissions.
+Status CoreFileMemoryRanges::FinalizeCoreFileSaveRanges() {
+ Status error;
+ this->Sort();
+ for (size_t i = this->GetSize() - 1; i > 0; i--) {
+ auto region_one = this->GetMutableEntryAtIndex(i);
+ auto region_two = this->GetMutableEntryAtIndex(i - 1);
+ if (Overlaps(region_one, region_two)) {
+ // It's okay for intersecting regions to have different permissions, but
+ // if they overlap we fail because we don't know what to do with them.
+ if (!PermissionsMatch(region_one, region_two)) {
+ // Permissions mismatch and it's not a simple intersection.
+ if (!OnlyIntersects(region_one, region_two)) {
+ error = Status::FromErrorStringWithFormatv(
+ "Memory region at {0}::{1} has different permissions than "
+ "overlapping region at {2}::{3}",
+ region_one->GetRangeBase(), region_one->GetRangeEnd(),
+ region_two->GetRangeBase(), region_two->GetRangeEnd());
+ return error;
+ }
+ // Simple intersection; we can just skip merging these.
+ continue;
+ }
+ const addr_t base =
+ std::min(region_one->GetRangeBase(), region_two->GetRangeBase());
+ const addr_t byte_size =
+ std::max(region_one->GetRangeEnd(), region_two->GetRangeEnd()) - base;
+
+ region_two->SetRangeBase(base);
+ region_two->SetByteSize(byte_size);
+
+ // Because this is a range data vector, the entry has a base as well
+ // as the data contained in the entry. So we have to update both.
+ // And llvm::AddressRange isn't mutable so we have to create a new one.
+ llvm::AddressRange range(base, base + byte_size);
+ const CoreFileMemoryRange core_range = {
+ range, region_two->data.lldb_permissions};
+ region_two->data = core_range;
+ // Erase deletes the half-open index range [inclusive, exclusive).
+ if (!this->Erase(i, i + 1)) {
+ error = Status::FromErrorStringWithFormat(
+ "Core file memory ranges mutated outside of "
+ "CalculateCoreFileSaveRanges");
+ return error;
+ }
+ }
+ }
+
+ return error;
+}
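
For readers skimming the diff, here is a minimal standalone sketch of the merge policy FinalizeCoreFileSaveRanges implements, written with plain std types rather than lldb's RangeDataVector (all names are illustrative, not the lldb_private API): sort by base address, merge overlapping ranges only when permissions match, fail on a true overlap with mismatched permissions, and leave merely adjacent ranges with different permissions alone.

#include <algorithm>
#include <cstdint>
#include <optional>
#include <string>
#include <vector>

struct Range {
  uint64_t base, end; // half-open [base, end)
  uint32_t permissions;
};

// Returns an error message on a true overlap with mismatched permissions.
std::optional<std::string> Finalize(std::vector<Range> &ranges) {
  std::sort(ranges.begin(), ranges.end(),
            [](const Range &a, const Range &b) { return a.base < b.base; });
  for (size_t i = ranges.size(); i-- > 1;) {
    Range &hi = ranges[i], &lo = ranges[i - 1];
    if (lo.end < hi.base) // disjoint: nothing to do
      continue;
    if (lo.permissions != hi.permissions) {
      if (lo.end == hi.base) // merely adjacent: keep both
        continue;
      return "overlapping regions have different permissions";
    }
    // Same permissions: fold the higher range into the lower one.
    lo.end = std::max(lo.end, hi.end);
    ranges.erase(ranges.begin() + i);
  }
  return std::nullopt;
}

As in the real code, the walk runs from high addresses down, so a freshly merged range is re-checked against its lower neighbor on the next iteration.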
diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp
index 40f3115..aca0897 100644
--- a/lldb/source/Target/Process.cpp
+++ b/lldb/source/Target/Process.cpp
@@ -6463,7 +6463,7 @@ Status Process::WriteMemoryTags(lldb::addr_t addr, size_t len,
}
// Create a CoreFileMemoryRange from a MemoryRegionInfo
-static Process::CoreFileMemoryRange
+static CoreFileMemoryRange
CreateCoreFileMemoryRange(const MemoryRegionInfo &region) {
const addr_t addr = region.GetRange().GetRangeBase();
llvm::AddressRange range(addr, addr + region.GetRange().GetByteSize());
@@ -6474,7 +6474,7 @@ CreateCoreFileMemoryRange(const MemoryRegionInfo &region) {
// were added. Return false if the dirty page information is not valid for
// the region.
static bool AddDirtyPages(const MemoryRegionInfo &region,
- Process::CoreFileMemoryRanges &ranges) {
+ CoreFileMemoryRanges &ranges) {
const auto &dirty_page_list = region.GetDirtyPageList();
if (!dirty_page_list)
return false;
@@ -6494,14 +6494,14 @@ static bool AddDirtyPages(const MemoryRegionInfo &region,
} else {
// Add previous contiguous range and init the new range with the
// current dirty page.
- ranges.push_back({range, lldb_permissions});
+ ranges.Append(range.start(), range.size(), {range, lldb_permissions});
range = llvm::AddressRange(page_addr, page_addr + page_size);
}
}
}
// The last range
if (!range.empty())
- ranges.push_back({range, lldb_permissions});
+ ranges.Append(range.start(), range.size(), {range, lldb_permissions});
return true;
}
@@ -6513,7 +6513,7 @@ static bool AddDirtyPages(const MemoryRegionInfo &region,
// will be added to \a ranges, else the entire range will be added to \a
// ranges.
static void AddRegion(const MemoryRegionInfo &region, bool try_dirty_pages,
- Process::CoreFileMemoryRanges &ranges) {
+ CoreFileMemoryRanges &ranges) {
// Don't add empty ranges.
if (region.GetRange().GetByteSize() == 0)
return;
@@ -6522,13 +6522,17 @@ static void AddRegion(const MemoryRegionInfo &region, bool try_dirty_pages,
return;
if (try_dirty_pages && AddDirtyPages(region, ranges))
return;
- ranges.push_back(CreateCoreFileMemoryRange(region));
+
+ ranges.Append(region.GetRange().GetRangeBase(),
+ region.GetRange().GetByteSize(),
+ CreateCoreFileMemoryRange(region));
}
-static void SaveOffRegionsWithStackPointers(
- Process &process, const SaveCoreOptions &core_options,
- const MemoryRegionInfos &regions, Process::CoreFileMemoryRanges &ranges,
- std::set<addr_t> &stack_ends) {
+static void SaveOffRegionsWithStackPointers(Process &process,
+ const SaveCoreOptions &core_options,
+ const MemoryRegionInfos &regions,
+ CoreFileMemoryRanges &ranges,
+ std::set<addr_t> &stack_ends) {
const bool try_dirty_pages = true;
// Before we take any dump, we want to save off the used portions of the
@@ -6568,11 +6572,11 @@ static void SaveOffRegionsWithStackPointers(
// for a full core file style.
static void GetCoreFileSaveRangesFull(Process &process,
const MemoryRegionInfos &regions,
- Process::CoreFileMemoryRanges &ranges,
+ CoreFileMemoryRanges &ranges,
std::set<addr_t> &stack_ends) {
// Don't add only dirty pages, add full regions.
-const bool try_dirty_pages = false;
+ const bool try_dirty_pages = false;
for (const auto &region : regions)
if (stack_ends.count(region.GetRange().GetRangeEnd()) == 0)
AddRegion(region, try_dirty_pages, ranges);
@@ -6582,9 +6586,10 @@ const bool try_dirty_pages = false;
// least some dirty pages, as some OS versions don't support reporting what
// pages are dirty within an memory region. If no memory regions have dirty
// page information fall back to saving out all ranges with write permissions.
-static void GetCoreFileSaveRangesDirtyOnly(
- Process &process, const MemoryRegionInfos &regions,
- Process::CoreFileMemoryRanges &ranges, std::set<addr_t> &stack_ends) {
+static void GetCoreFileSaveRangesDirtyOnly(Process &process,
+ const MemoryRegionInfos &regions,
+ CoreFileMemoryRanges &ranges,
+ std::set<addr_t> &stack_ends) {
// Iterate over the regions and find all dirty pages.
bool have_dirty_page_info = false;
@@ -6613,9 +6618,10 @@ static void GetCoreFileSaveRangesDirtyOnly(
// dirty regions as this will make the core file smaller. If the process
// doesn't support dirty regions, then it will fall back to adding the full
// stack region.
-static void GetCoreFileSaveRangesStackOnly(
- Process &process, const MemoryRegionInfos &regions,
- Process::CoreFileMemoryRanges &ranges, std::set<addr_t> &stack_ends) {
+static void GetCoreFileSaveRangesStackOnly(Process &process,
+ const MemoryRegionInfos &regions,
+ CoreFileMemoryRanges &ranges,
+ std::set<addr_t> &stack_ends) {
const bool try_dirty_pages = true;
// Some platforms support annotating the region information that tell us that
// it comes from a thread stack. So look for those regions first.
@@ -6628,6 +6634,24 @@ static void GetCoreFileSaveRangesStackOnly(
}
}
+static void GetUserSpecifiedCoreFileSaveRanges(Process &process,
+ const MemoryRegionInfos &regions,
+ const SaveCoreOptions &options,
+ CoreFileMemoryRanges &ranges) {
+ const auto &option_ranges = options.GetCoreFileMemoryRanges();
+ if (option_ranges.IsEmpty())
+ return;
+
+ for (const auto &range : regions) {
+ auto entry = option_ranges.FindEntryThatContains(range.GetRange());
+ if (entry) {
+ ranges.Append(range.GetRange().GetRangeBase(),
+ range.GetRange().GetByteSize(),
+ CreateCoreFileMemoryRange(range));
+ }
+ }
+}
+
Status Process::CalculateCoreFileSaveRanges(const SaveCoreOptions &options,
CoreFileMemoryRanges &ranges) {
lldb_private::MemoryRegionInfos regions;
@@ -6643,11 +6667,18 @@ Status Process::CalculateCoreFileSaveRanges(const SaveCoreOptions &options,
"callers must set the core_style to something other than "
"eSaveCoreUnspecified");
+ GetUserSpecifiedCoreFileSaveRanges(*this, regions, options, ranges);
+
std::set<addr_t> stack_ends;
- SaveOffRegionsWithStackPointers(*this, options, regions, ranges, stack_ends);
+ // For fully custom setups, we don't want to even look at threads if there
+ // are no threads specified.
+ if (core_style != lldb::eSaveCoreCustomOnly || options.HasSpecifiedThreads())
+ SaveOffRegionsWithStackPointers(*this, options, regions, ranges,
+ stack_ends);
switch (core_style) {
case eSaveCoreUnspecified:
+ case eSaveCoreCustomOnly:
break;
case eSaveCoreFull:
@@ -6666,10 +6697,11 @@ Status Process::CalculateCoreFileSaveRanges(const SaveCoreOptions &options,
if (err.Fail())
return err;
- if (ranges.empty())
- return Status("no valid address ranges found for core style");
+ if (ranges.IsEmpty())
+ return Status::FromErrorStringWithFormat(
+ "no valid address ranges found for core style");
- return Status(); // Success!
+ return ranges.FinalizeCoreFileSaveRanges();
}
std::vector<ThreadSP>
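
The mechanical part of this file's change is the container swap: CoreFileMemoryRanges is a RangeDataVector, so every Append now passes the address range twice, once as the sortable (base, size) key and once inside the CoreFileMemoryRange payload that the Mach-O and minidump writers read back through entry.data. A hedged sketch of that shape with stand-in types (not the actual lldb/llvm classes):

#include <cstdint>
#include <vector>

// Stand-ins for llvm::AddressRange and lldb's CoreFileMemoryRange.
struct AddressRange {
  uint64_t start, end;
};
struct CoreFileMemoryRange {
  AddressRange range;
  uint32_t lldb_permissions;
};

// Minimal RangeDataVector-like entry: a (base, size) key for sorting and
// lookup, plus the payload consumers read via entry.data.
struct Entry {
  uint64_t base, size;
  CoreFileMemoryRange data;
};

struct Ranges {
  std::vector<Entry> entries;
  void Append(uint64_t base, uint64_t size, CoreFileMemoryRange data) {
    entries.push_back({base, size, data});
  }
};

int main() {
  Ranges ranges;
  AddressRange r{0x1000, 0x2000};
  // The key and the payload carry the same range, mirroring
  // ranges.Append(range.start(), range.size(), {range, perms}) above.
  ranges.Append(r.start, r.end - r.start, {r, /*lldb_permissions=*/0x3});
  return ranges.entries.size() == 1 ? 0 : 1;
}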
diff --git a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py
index ed15793..2cbe20e 100644
--- a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py
+++ b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py
@@ -344,3 +344,152 @@ class ProcessSaveCoreMinidumpTestCase(TestBase):
self.assertTrue(self.dbg.DeleteTarget(target))
if os.path.isfile(default_value_file):
os.unlink(default_value_file)
+
+ @skipUnlessArch("x86_64")
+ @skipUnlessPlatform(["linux"])
+ def test_save_linux_minidump_one_region(self):
+ """Test that we can save a Linux mini dump with one region in sbsavecore regions"""
+
+ self.build()
+ exe = self.getBuildArtifact("a.out")
+ one_region_file = self.getBuildArtifact("core.one_region.dmp")
+ try:
+ target = self.dbg.CreateTarget(exe)
+ process = target.LaunchSimple(
+ None, None, self.get_process_working_directory()
+ )
+ self.assertState(process.GetState(), lldb.eStateStopped)
+
+ memory_region = lldb.SBMemoryRegionInfo()
+ memory_list = process.GetMemoryRegions()
+ memory_list.GetMemoryRegionAtIndex(0, memory_region)
+
+ # Save only the single region with the custom-only style so nothing
+ # else (like full thread stacks) is included in the core file.
+ options = lldb.SBSaveCoreOptions()
+ file_spec = lldb.SBFileSpec(one_region_file)
+ options.SetOutputFile(file_spec)
+ options.SetPluginName("minidump")
+ options.AddMemoryRegionToSave(memory_region)
+ options.SetStyle(lldb.eSaveCoreCustomOnly)
+ error = process.SaveCore(options)
+ print(f"Error: {error.GetCString()}")
+ self.assertTrue(error.Success(), error.GetCString())
+
+ core_target = self.dbg.CreateTarget(None)
+ core_proc = core_target.LoadCore(one_region_file)
+ core_memory_list = core_proc.GetMemoryRegions()
+ # Note because the /proc/pid maps are included on linux, we can't
+ # depend on size for validation, so we'll ensure the first region
+ # is present and then assert we fail on the second.
+ core_memory_region = lldb.SBMemoryRegionInfo()
+ core_memory_list.GetMemoryRegionAtIndex(0, core_memory_region)
+ self.assertEqual(
+ core_memory_region.GetRegionBase(), memory_region.GetRegionBase()
+ )
+ self.assertEqual(
+ core_memory_region.GetRegionEnd(), memory_region.GetRegionEnd()
+ )
+
+ region_two = lldb.SBMemoryRegionInfo()
+ core_memory_list.GetMemoryRegionAtIndex(1, region_two)
+ err = lldb.SBError()
+ content = core_proc.ReadMemory(region_two.GetRegionBase(), 1, err)
+ self.assertTrue(err.Fail(), "Should fail to read memory")
+
+ finally:
+ self.assertTrue(self.dbg.DeleteTarget(target))
+ if os.path.isfile(one_region_file):
+ os.unlink(one_region_file)
+
+ @skipUnlessArch("x86_64")
+ @skipUnlessPlatform(["linux"])
+ def test_save_minidump_custom_save_style(self):
+ """Test that verifies a custom and unspecified save style fails for
+ containing no data to save"""
+
+ self.build()
+ exe = self.getBuildArtifact("a.out")
+ custom_file = self.getBuildArtifact("core.custom.dmp")
+ try:
+ target = self.dbg.CreateTarget(exe)
+ process = target.LaunchSimple(
+ None, None, self.get_process_working_directory()
+ )
+ self.assertState(process.GetState(), lldb.eStateStopped)
+
+ options = lldb.SBSaveCoreOptions()
+ options.SetOutputFile(lldb.SBFileSpec(custom_file))
+ options.SetPluginName("minidump")
+ options.SetStyle(lldb.eSaveCoreCustomOnly)
+
+ error = process.SaveCore(options)
+ self.assertTrue(error.Fail())
+ self.assertEqual(
+ error.GetCString(), "no valid address ranges found for core style"
+ )
+
+ finally:
+ self.assertTrue(self.dbg.DeleteTarget(target))
+ if os.path.isfile(custom_file):
+ os.unlink(custom_file)
+
+ def save_core_with_region(self, process, region_index):
+ custom_file = self.getBuildArtifact("core.custom.dmp")
+ try:
+ memory_region = lldb.SBMemoryRegionInfo()
+ memory_list = process.GetMemoryRegions()
+ memory_list.GetMemoryRegionAtIndex(region_index, memory_region)
+ options = lldb.SBSaveCoreOptions()
+ options.SetOutputFile(lldb.SBFileSpec(custom_file))
+ options.SetPluginName("minidump")
+ options.AddMemoryRegionToSave(memory_region)
+ options.SetStyle(lldb.eSaveCoreFull)
+
+ error = process.SaveCore(options)
+ self.assertTrue(error.Success())
+ core_target = self.dbg.CreateTarget(None)
+ core_proc = core_target.LoadCore(custom_file)
+ core_memory_list = core_proc.GetMemoryRegions()
+ # /proc/pid/ maps are included on Linux, so we can't depend on size
+ # for validation. Instead we collect all the ranges into a set
+ # and ensure there are no duplicates.
+ range_set = set()
+ for x in range(core_memory_list.GetSize()):
+ core_memory_region = lldb.SBMemoryRegionInfo()
+ core_memory_list.GetMemoryRegionAtIndex(x, core_memory_region)
+ mem_tuple = (
+ core_memory_region.GetRegionBase(),
+ core_memory_region.GetRegionEnd(),
+ )
+ self.assertTrue(
+ mem_tuple not in range_set, "Duplicate memory region found"
+ )
+ range_set.add(mem_tuple)
+ finally:
+ if os.path.isfile(custom_file):
+ os.unlink(custom_file)
+
+ @skipUnlessArch("x86_64")
+ @skipUnlessPlatform(["linux"])
+ def test_save_minidump_custom_save_style_duplicated_regions(self):
+ """Test that verifies a custom and unspecified save style fails for
+ containing no data to save"""
+
+ self.build()
+ exe = self.getBuildArtifact("a.out")
+ try:
+ target = self.dbg.CreateTarget(exe)
+ process = target.LaunchSimple(
+ None, None, self.get_process_working_directory()
+ )
+ self.assertState(process.GetState(), lldb.eStateStopped)
+
+ memory_list = process.GetMemoryRegions()
+ # Test that we don't duplicate regions by re-adding regions
+ # at various indices.
+ self.save_core_with_region(process, 0)
+ self.save_core_with_region(process, len(memory_list) - 1)
+
+ finally:
+ self.assertTrue(self.dbg.DeleteTarget(target))
diff --git a/lldb/test/Shell/SymbolFile/Inputs/main.c b/lldb/test/Shell/SymbolFile/Inputs/main.c
new file mode 100644
index 0000000..341417f
--- /dev/null
+++ b/lldb/test/Shell/SymbolFile/Inputs/main.c
@@ -0,0 +1,4 @@
+int main(int argc, char **argv) {
+ // Break on main.
+ return 1;
+}
diff --git a/lldb/test/Shell/SymbolFile/checksum-mismatch.test b/lldb/test/Shell/SymbolFile/checksum-mismatch.test
new file mode 100644
index 0000000..5db9764
--- /dev/null
+++ b/lldb/test/Shell/SymbolFile/checksum-mismatch.test
@@ -0,0 +1,7 @@
+RUN: mkdir -p %t
+RUN: cp %S/Inputs/main.c %t/main.c
+RUN: %clang_host %t/main.c -std=c99 -gdwarf-5 -o %t/main.out
+RUN: echo "// Modify source file hash" >> %t/main.c
+RUN: %lldb -b %t/main.out -o 'b main' -o 'r' 2>&1 | FileCheck %s
+
+CHECK: warning: main.c: source file checksum mismatch between line table ({{.*}}) and file on disk ({{.*}})
diff --git a/lldb/unittests/Process/Utility/CMakeLists.txt b/lldb/unittests/Process/Utility/CMakeLists.txt
index 651f871..ec0ff95 100644
--- a/lldb/unittests/Process/Utility/CMakeLists.txt
+++ b/lldb/unittests/Process/Utility/CMakeLists.txt
@@ -18,6 +18,7 @@ add_lldb_unittest(ProcessUtilityTests
LinuxProcMapsTest.cpp
MemoryTagManagerAArch64MTETest.cpp
RegisterContextTest.cpp
+ CoreFileMemoryRangesTest.cpp
${PLATFORM_SOURCES}
LINK_LIBS
diff --git a/lldb/unittests/Process/Utility/CoreFileMemoryRangesTest.cpp b/lldb/unittests/Process/Utility/CoreFileMemoryRangesTest.cpp
new file mode 100644
index 0000000..6d514b1
--- /dev/null
+++ b/lldb/unittests/Process/Utility/CoreFileMemoryRangesTest.cpp
@@ -0,0 +1,205 @@
+//===-- CoreFileMemoryRangesTest.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+#include "lldb/Target/CoreFileMemoryRanges.h"
+#include "lldb/lldb-types.h"
+
+using namespace lldb_private;
+
+TEST(CoreFileMemoryRangesTest, MapOverlappingRanges) {
+ lldb_private::CoreFileMemoryRanges ranges;
+ const lldb::addr_t start_addr = 0x1000;
+ const lldb::addr_t increment_addr = 0x1000;
+ const size_t iterations = 10;
+ for (size_t i = 0; i < iterations; i++) {
+ const lldb::addr_t start = start_addr + (i * increment_addr);
+ const lldb::addr_t end = start + increment_addr;
+ // Arbitrary value
+ const uint32_t permissions = 0x3;
+ llvm::AddressRange range(start, end);
+ const CoreFileMemoryRange core_range = {range, permissions};
+ // The entry key is (start, size), while the range itself is [start, end).
+ CoreFileMemoryRanges::Entry entry = {start, end - start, core_range};
+ ranges.Append(entry);
+ }
+
+ Status error = ranges.FinalizeCoreFileSaveRanges();
+ EXPECT_TRUE(error.Success());
+ ASSERT_THAT(1, ranges.GetSize());
+ const auto range = ranges.GetEntryAtIndex(0);
+ ASSERT_TRUE(range);
+ ASSERT_THAT(start_addr, range->GetRangeBase());
+ ASSERT_THAT(start_addr + (iterations * increment_addr), range->GetRangeEnd());
+}
+
+TEST(CoreFileMemoryRangesTest, RangesSplitByPermissions) {
+ lldb_private::CoreFileMemoryRanges ranges;
+ const lldb::addr_t start_addr = 0x1000;
+ const lldb::addr_t increment_addr = 0x1000;
+ const size_t iterations = 10;
+ for (size_t i = 0; i < iterations; i++) {
+ const lldb::addr_t start = start_addr + (i * increment_addr);
+ const lldb::addr_t end = start + increment_addr;
+ const uint32_t permissions = i;
+ llvm::AddressRange range(start, end);
+ const CoreFileMemoryRange core_range = {range, permissions};
+ // The entry key is (start, size), while the range itself is [start, end).
+ CoreFileMemoryRanges::Entry entry = {start, end - start, core_range};
+ ranges.Append(entry);
+ }
+
+ Status error = ranges.FinalizeCoreFileSaveRanges();
+ EXPECT_TRUE(error.Success());
+ ASSERT_THAT(10, ranges.GetSize());
+ const auto range = ranges.GetEntryAtIndex(0);
+ ASSERT_TRUE(range);
+ ASSERT_THAT(start_addr, range->GetRangeBase());
+ ASSERT_THAT(start_addr + increment_addr, range->GetRangeEnd());
+}
+
+TEST(CoreFileMemoryRangesTest, MapPartialOverlappingRanges) {
+ lldb_private::CoreFileMemoryRanges ranges;
+ const lldb::addr_t start_addr = 0x1000;
+ const lldb::addr_t increment_addr = 0x1000;
+ const size_t iterations = 10;
+ for (size_t i = 0; i < iterations; i++) {
+ const lldb::addr_t start = start_addr + (i * increment_addr);
+ const lldb::addr_t end = start + increment_addr;
+ // Arbitrary value
+ const uint32_t permissions = 0x3;
+ llvm::AddressRange range(start, end);
+ const CoreFileMemoryRange core_range = {range, permissions};
+ // The entry key is (start, size), while the range itself is [start, end).
+ CoreFileMemoryRanges::Entry entry = {start, end - start, core_range};
+ ranges.Append(entry);
+ }
+
+ const lldb::addr_t unique_start = 0x7fff0000;
+ const lldb::addr_t unique_end = unique_start + increment_addr;
+ llvm::AddressRange range(unique_start, unique_end);
+ const uint32_t permissions = 0x3;
+ const CoreFileMemoryRange core_range = {range, permissions};
+ // The entry key is (start, size), while the range itself is [start, end).
+ CoreFileMemoryRanges::Entry entry = {unique_start, unique_end - unique_start,
+ core_range};
+ ranges.Append(entry);
+
+ Status error = ranges.FinalizeCoreFileSaveRanges();
+ EXPECT_TRUE(error.Success());
+ ASSERT_THAT(2, ranges.GetSize());
+ const auto merged_range = ranges.GetEntryAtIndex(0);
+ ASSERT_TRUE(merged_range);
+ ASSERT_THAT(start_addr, merged_range->GetRangeBase());
+ ASSERT_THAT(start_addr + (iterations * increment_addr),
+ merged_range->GetRangeEnd());
+ const auto unique_range = ranges.GetEntryAtIndex(1);
+ ASSERT_TRUE(unique_range);
+ ASSERT_THAT(unique_start, unique_range->GetRangeBase());
+ ASSERT_THAT(unique_end, unique_range->GetRangeEnd());
+}
+
+TEST(CoreFileMemoryRangesTest, SuperiorAndInferiorRanges_SamePermissions) {
+ lldb_private::CoreFileMemoryRanges ranges;
+ const lldb::addr_t start_addr = 0x1000;
+ const lldb::addr_t increment_addr = 0x1000;
+ const lldb::addr_t superior_region_end = start_addr + increment_addr * 10;
+ llvm::AddressRange range(start_addr, superior_region_end);
+ const CoreFileMemoryRange core_range = {range, 0x3};
+ CoreFileMemoryRanges::Entry entry = {
+ start_addr, superior_region_end - start_addr, core_range};
+ ranges.Append(entry);
+ const lldb::addr_t inferior_region_end = start_addr + increment_addr;
+ llvm::AddressRange inferior_range(start_addr, inferior_region_end);
+ const CoreFileMemoryRange inferior_core_range = {inferior_range, 0x3};
+ CoreFileMemoryRanges::Entry inferior_entry = {
+ start_addr, inferior_region_end - start_addr, inferior_core_range};
+ ranges.Append(inferior_entry);
+
+ Status error = ranges.FinalizeCoreFileSaveRanges();
+ EXPECT_TRUE(error.Success());
+ ASSERT_THAT(1, ranges.GetSize());
+ const auto searched_range = ranges.GetEntryAtIndex(0);
+ ASSERT_TRUE(searched_range);
+ ASSERT_THAT(start_addr, searched_range->GetRangeBase());
+ ASSERT_THAT(superior_region_end, searched_range->GetRangeEnd());
+}
+
+TEST(CoreFileMemoryRangesTest, SuperiorAndInferiorRanges_DifferentPermissions) {
+ lldb_private::CoreFileMemoryRanges ranges;
+ const lldb::addr_t start_addr = 0x1000;
+ const lldb::addr_t increment_addr = 0x1000;
+ const lldb::addr_t superior_region_end = start_addr + increment_addr * 10;
+ llvm::AddressRange range(start_addr, superior_region_end);
+ const CoreFileMemoryRange core_range = {range, 0x3};
+ CoreFileMemoryRanges::Entry entry = {
+ start_addr, superior_region_end - start_addr, core_range};
+ ranges.Append(entry);
+ const lldb::addr_t inferior_region_end = start_addr + increment_addr;
+ llvm::AddressRange inferior_range(start_addr, inferior_region_end);
+ const CoreFileMemoryRange inferior_core_range = {inferior_range, 0x4};
+ CoreFileMemoryRanges::Entry inferior_entry = {
+ start_addr, inferior_region_end - start_addr, inferior_core_range};
+ ranges.Append(inferior_entry);
+
+ Status error = ranges.FinalizeCoreFileSaveRanges();
+ EXPECT_TRUE(error.Fail());
+}
+
+TEST(CoreFileMemoryRangesTest, NonIntersectingRangesSamePermissions) {
+ const int permissions = 0x7;
+ lldb_private::CoreFileMemoryRanges ranges;
+ const lldb::addr_t region_one_start = 0x1000;
+ const lldb::addr_t region_one_end = 0x2000;
+ llvm::AddressRange range_one(region_one_start, region_one_end);
+ const CoreFileMemoryRange core_range_one = {range_one, permissions};
+ CoreFileMemoryRanges::Entry entry_one = {
+ region_one_start, region_one_end - region_one_start, core_range_one};
+ ranges.Append(entry_one);
+ const lldb::addr_t region_two_start = 0xb000;
+ const lldb::addr_t region_two_end = 0xc000;
+ llvm::AddressRange range_two(region_two_start, region_two_end);
+ const CoreFileMemoryRange core_range_two = {range_two, permissions};
+ CoreFileMemoryRanges::Entry entry_two = {
+ region_two_start, region_two_end - region_two_start, core_range_two};
+ ranges.Append(entry_two);
+
+ Status error = ranges.FinalizeCoreFileSaveRanges();
+ EXPECT_TRUE(error.Success());
+ ASSERT_THAT(2UL, ranges.GetSize());
+ ASSERT_THAT(region_one_start, ranges.GetEntryAtIndex(0)->GetRangeBase());
+ ASSERT_THAT(region_two_start, ranges.GetEntryAtIndex(1)->GetRangeBase());
+}
+
+TEST(CoreFileMemoryRangesTest, PartialOverlapping) {
+ const int permissions = 0x3;
+ lldb_private::CoreFileMemoryRanges ranges;
+ const lldb::addr_t start_addr = 0x1000;
+ const lldb::addr_t end_addr = 0x2000;
+ llvm::AddressRange range_one(start_addr, end_addr);
+ const CoreFileMemoryRange core_range_one = {range_one, permissions};
+ CoreFileMemoryRanges::Entry entry_one = {start_addr, end_addr - start_addr,
+ core_range_one};
+ llvm::AddressRange range_two(start_addr / 2, end_addr / 2);
+ const CoreFileMemoryRange core_range_two = {range_two, permissions};
+ CoreFileMemoryRanges::Entry entry_two = {
+ start_addr / 2, end_addr / 2 - start_addr / 2, core_range_two};
+ ranges.Append(entry_one);
+ ranges.Append(entry_two);
+
+ Status error = ranges.FinalizeCoreFileSaveRanges();
+ EXPECT_TRUE(error.Success());
+ ASSERT_THAT(1, ranges.GetSize());
+ const auto searched_range = ranges.GetEntryAtIndex(0);
+ ASSERT_TRUE(searched_range);
+ ASSERT_THAT(start_addr / 2, searched_range->GetRangeBase());
+ ASSERT_THAT(end_addr, searched_range->GetRangeEnd());
+}
diff --git a/llvm/benchmarks/SandboxIRBench.cpp b/llvm/benchmarks/SandboxIRBench.cpp
index c646ba6..ca2cab6 100644
--- a/llvm/benchmarks/SandboxIRBench.cpp
+++ b/llvm/benchmarks/SandboxIRBench.cpp
@@ -34,15 +34,19 @@ static std::unique_ptr<Module> parseIR(LLVMContext &C, const char *IR) {
}
enum class IR {
- LLVM,
- SBox,
+ LLVM, ///< LLVM IR
+ SBoxNoTracking, ///< Sandbox IR with tracking disabled
+ SBoxTracking, ///< Sandbox IR with tracking enabled
};
// Traits to get llvm::BasicBlock/sandboxir::BasicBlock from the IR enumerators.
template <IR IRTy> struct TypeSelect {};
template <> struct TypeSelect<IR::LLVM> {
using BasicBlock = llvm::BasicBlock;
};
-template <> struct TypeSelect<IR::SBox> {
+template <> struct TypeSelect<IR::SBoxNoTracking> {
+ using BasicBlock = sandboxir::BasicBlock;
+};
+template <> struct TypeSelect<IR::SBoxTracking> {
using BasicBlock = sandboxir::BasicBlock;
};
@@ -59,12 +63,22 @@ genIR(std::unique_ptr<llvm::Module> &LLVMM, LLVMContext &LLVMCtx,
sandboxir::Function *F = Ctx.createFunction(LLVMF);
sandboxir::BasicBlock *BB = &*F->begin();
+ // Start tracking if we are testing with tracking enabled.
+ if constexpr (IRTy == IR::SBoxTracking)
+ Ctx.save();
+
if constexpr (IRTy == IR::LLVM)
return LLVMBB;
else
return BB;
}
+template <IR IRTy> static void finalize(sandboxir::Context &Ctx) {
+ // Accept changes if we are tracking.
+ if constexpr (IRTy == IR::SBoxTracking)
+ Ctx.accept();
+}
+
static std::string generateBBWalkIR(unsigned Size) {
std::stringstream SS;
SS << "define void @foo(i32 %v1, i32 %v2) {\n";
@@ -132,15 +146,61 @@ template <IR IRTy> static void RAUW(benchmark::State &State) {
Def1->replaceAllUsesWith(Def2);
Def2->replaceAllUsesWith(Def1);
}
+ finalize<IRTy>(Ctx);
+}
+
+static std::string generateRUOWIR(unsigned NumOperands) {
+ std::stringstream SS;
+ auto GenOps = [&SS, NumOperands]() {
+ for (auto Cnt : seq<unsigned>(0, NumOperands)) {
+ SS << "i8 %arg" << Cnt;
+ bool IsLast = Cnt + 1 == NumOperands;
+ if (!IsLast)
+ SS << ", ";
+ }
+ };
+
+ SS << "define void @foo(";
+ GenOps();
+ SS << ") {\n";
+
+ SS << " call void @foo(";
+ GenOps();
+ SS << ")\n";
+ SS << "ret void";
+ SS << "}";
+ return SS.str();
+}
+
+template <IR IRTy> static void RUOW(benchmark::State &State) {
+ LLVMContext LLVMCtx;
+ sandboxir::Context Ctx(LLVMCtx);
+ std::unique_ptr<llvm::Module> LLVMM;
+ unsigned NumOperands = State.range(0);
+ auto *BB = genIR<IRTy>(LLVMM, LLVMCtx, Ctx, generateRUOWIR, NumOperands);
+
+ auto It = BB->begin();
+ auto *F = BB->getParent();
+ auto *Arg0 = F->getArg(0);
+ auto *Arg1 = F->getArg(1);
+ auto *Call = &*It++;
+ for (auto _ : State)
+ Call->replaceUsesOfWith(Arg0, Arg1);
+ finalize<IRTy>(Ctx);
}
BENCHMARK(GetType<IR::LLVM>);
-BENCHMARK(GetType<IR::SBox>);
+BENCHMARK(GetType<IR::SBoxNoTracking>);
BENCHMARK(BBWalk<IR::LLVM>)->Args({1024});
-BENCHMARK(BBWalk<IR::SBox>)->Args({1024});
+BENCHMARK(BBWalk<IR::SBoxTracking>)->Args({1024});
BENCHMARK(RAUW<IR::LLVM>)->Args({512});
-BENCHMARK(RAUW<IR::SBox>)->Args({512});
+BENCHMARK(RAUW<IR::SBoxNoTracking>)->Args({512});
+BENCHMARK(RAUW<IR::SBoxTracking>)->Args({512});
+
+BENCHMARK(RUOW<IR::LLVM>)->Args({4096});
+BENCHMARK(RUOW<IR::SBoxNoTracking>)->Args({4096});
+BENCHMARK(RUOW<IR::SBoxTracking>)->Args({4096});
BENCHMARK_MAIN();
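
The NoTracking/Tracking split works because the benchmark dispatches on a non-type template parameter, so the tracking calls compile away entirely in the no-tracking instantiation. A self-contained sketch of that TypeSelect/finalize pattern (the Mode names and printed strings are illustrative, not the SandboxIR API):

#include <iostream>

enum class Mode { Plain, Tracking };

// Compile-time trait selection keyed on the enum, as TypeSelect<IR> does.
template <Mode M> struct Select;
template <> struct Select<Mode::Plain> {
  static constexpr const char *Name = "plain";
};
template <> struct Select<Mode::Tracking> {
  static constexpr const char *Name = "tracking";
};

// 'if constexpr' removes the save/accept calls from the Plain instantiation
// at compile time, so the no-tracking variant pays no runtime check.
template <Mode M> void run() {
  if constexpr (M == Mode::Tracking)
    std::cout << "save()\n"; // stand-in for Ctx.save()
  std::cout << Select<M>::Name << " body\n";
  if constexpr (M == Mode::Tracking)
    std::cout << "accept()\n"; // stand-in for Ctx.accept()
}

int main() {
  run<Mode::Plain>();
  run<Mode::Tracking>();
}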
diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h
index 00290c9..083d5c9 100644
--- a/llvm/include/llvm/ADT/DenseMap.h
+++ b/llvm/include/llvm/ADT/DenseMap.h
@@ -109,7 +109,8 @@ public:
void clear() {
incrementEpoch();
- if (getNumEntries() == 0 && getNumTombstones() == 0) return;
+ if (getNumEntries() == 0 && getNumTombstones() == 0)
+ return;
// If the capacity of the array is huge, and the # elements used is small,
// shrink the array.
@@ -119,7 +120,7 @@ public:
}
const KeyT EmptyKey = getEmptyKey();
- if (std::is_trivially_destructible<ValueT>::value) {
+ if constexpr (std::is_trivially_destructible_v<ValueT>) {
// Use a simpler loop when values don't need destruction.
for (BucketT *P = getBuckets(), *E = getBucketsEnd(); P != E; ++P)
P->getFirst() = EmptyKey;
@@ -172,15 +173,14 @@ public:
/// The DenseMapInfo is responsible for supplying methods
/// getHashValue(LookupKeyT) and isEqual(LookupKeyT, KeyT) for each key
/// type used.
- template<class LookupKeyT>
- iterator find_as(const LookupKeyT &Val) {
+ template <class LookupKeyT> iterator find_as(const LookupKeyT &Val) {
if (BucketT *Bucket = doFind(Val))
return makeIterator(
Bucket, shouldReverseIterate<KeyT>() ? getBuckets() : getBucketsEnd(),
*this, true);
return end();
}
- template<class LookupKeyT>
+ template <class LookupKeyT>
const_iterator find_as(const LookupKeyT &Val) const {
if (const BucketT *Bucket = doFind(Val))
return makeConstIterator(
@@ -223,7 +223,7 @@ public:
// The value is constructed in-place if the key is not in the map, otherwise
// it is not moved.
template <typename... Ts>
- std::pair<iterator, bool> try_emplace(KeyT &&Key, Ts &&... Args) {
+ std::pair<iterator, bool> try_emplace(KeyT &&Key, Ts &&...Args) {
BucketT *TheBucket;
if (LookupBucketFor(Key, TheBucket))
return std::make_pair(makeIterator(TheBucket,
@@ -248,7 +248,7 @@ public:
// The value is constructed in-place if the key is not in the map, otherwise
// it is not moved.
template <typename... Ts>
- std::pair<iterator, bool> try_emplace(const KeyT &Key, Ts &&... Args) {
+ std::pair<iterator, bool> try_emplace(const KeyT &Key, Ts &&...Args) {
BucketT *TheBucket;
if (LookupBucketFor(Key, TheBucket))
return std::make_pair(makeIterator(TheBucket,
@@ -297,8 +297,7 @@ public:
}
/// insert - Range insertion of pairs.
- template<typename InputIt>
- void insert(InputIt I, InputIt E) {
+ template <typename InputIt> void insert(InputIt I, InputIt E) {
for (; I != E; ++I)
insert(*I);
}
@@ -420,7 +419,7 @@ protected:
setNumEntries(0);
setNumTombstones(0);
- assert((getNumBuckets() & (getNumBuckets()-1)) == 0 &&
+ assert((getNumBuckets() & (getNumBuckets() - 1)) == 0 &&
"# initial buckets must be a power of two!");
const KeyT EmptyKey = getEmptyKey();
for (BucketT *B = getBuckets(), *E = getBucketsEnd(); B != E; ++B)
@@ -491,7 +490,7 @@ protected:
return KeyInfoT::getHashValue(Val);
}
- template<typename LookupKeyT>
+ template <typename LookupKeyT>
static unsigned getHashValue(const LookupKeyT &Val) {
return KeyInfoT::getHashValue(Val);
}
@@ -502,14 +501,11 @@ protected:
return KeyInfoT::getEmptyKey();
}
- static const KeyT getTombstoneKey() {
- return KeyInfoT::getTombstoneKey();
- }
+ static const KeyT getTombstoneKey() { return KeyInfoT::getTombstoneKey(); }
private:
- iterator makeIterator(BucketT *P, BucketT *E,
- DebugEpochBase &Epoch,
- bool NoAdvance=false) {
+ iterator makeIterator(BucketT *P, BucketT *E, DebugEpochBase &Epoch,
+ bool NoAdvance = false) {
if (shouldReverseIterate<KeyT>()) {
BucketT *B = P == getBucketsEnd() ? getBuckets() : P + 1;
return iterator(B, E, Epoch, NoAdvance);
@@ -519,7 +515,7 @@ private:
const_iterator makeConstIterator(const BucketT *P, const BucketT *E,
const DebugEpochBase &Epoch,
- const bool NoAdvance=false) const {
+ const bool NoAdvance = false) const {
if (shouldReverseIterate<KeyT>()) {
const BucketT *B = P == getBucketsEnd() ? getBuckets() : P + 1;
return const_iterator(B, E, Epoch, NoAdvance);
@@ -535,13 +531,9 @@ private:
static_cast<DerivedT *>(this)->setNumEntries(Num);
}
- void incrementNumEntries() {
- setNumEntries(getNumEntries() + 1);
- }
+ void incrementNumEntries() { setNumEntries(getNumEntries() + 1); }
- void decrementNumEntries() {
- setNumEntries(getNumEntries() - 1);
- }
+ void decrementNumEntries() { setNumEntries(getNumEntries() - 1); }
unsigned getNumTombstones() const {
return static_cast<const DerivedT *>(this)->getNumTombstones();
@@ -551,45 +543,33 @@ private:
static_cast<DerivedT *>(this)->setNumTombstones(Num);
}
- void incrementNumTombstones() {
- setNumTombstones(getNumTombstones() + 1);
- }
+ void incrementNumTombstones() { setNumTombstones(getNumTombstones() + 1); }
- void decrementNumTombstones() {
- setNumTombstones(getNumTombstones() - 1);
- }
+ void decrementNumTombstones() { setNumTombstones(getNumTombstones() - 1); }
const BucketT *getBuckets() const {
return static_cast<const DerivedT *>(this)->getBuckets();
}
- BucketT *getBuckets() {
- return static_cast<DerivedT *>(this)->getBuckets();
- }
+ BucketT *getBuckets() { return static_cast<DerivedT *>(this)->getBuckets(); }
unsigned getNumBuckets() const {
return static_cast<const DerivedT *>(this)->getNumBuckets();
}
- BucketT *getBucketsEnd() {
- return getBuckets() + getNumBuckets();
- }
+ BucketT *getBucketsEnd() { return getBuckets() + getNumBuckets(); }
const BucketT *getBucketsEnd() const {
return getBuckets() + getNumBuckets();
}
- void grow(unsigned AtLeast) {
- static_cast<DerivedT *>(this)->grow(AtLeast);
- }
+ void grow(unsigned AtLeast) { static_cast<DerivedT *>(this)->grow(AtLeast); }
- void shrink_and_clear() {
- static_cast<DerivedT *>(this)->shrink_and_clear();
- }
+ void shrink_and_clear() { static_cast<DerivedT *>(this)->shrink_and_clear(); }
template <typename KeyArg, typename... ValueArgs>
BucketT *InsertIntoBucket(BucketT *TheBucket, KeyArg &&Key,
- ValueArgs &&... Values) {
+ ValueArgs &&...Values) {
TheBucket = InsertIntoBucketImpl(Key, Key, TheBucket);
TheBucket->getFirst() = std::forward<KeyArg>(Key);
@@ -627,8 +607,9 @@ private:
this->grow(NumBuckets * 2);
LookupBucketFor(Lookup, TheBucket);
NumBuckets = getNumBuckets();
- } else if (LLVM_UNLIKELY(NumBuckets-(NewNumEntries+getNumTombstones()) <=
- NumBuckets/8)) {
+ } else if (LLVM_UNLIKELY(NumBuckets -
+ (NewNumEntries + getNumTombstones()) <=
+ NumBuckets / 8)) {
this->grow(NumBuckets);
LookupBucketFor(Lookup, TheBucket);
}
@@ -696,7 +677,7 @@ private:
!KeyInfoT::isEqual(Val, TombstoneKey) &&
"Empty/Tombstone value shouldn't be inserted into map!");
- unsigned BucketNo = getHashValue(Val) & (NumBuckets-1);
+ unsigned BucketNo = getHashValue(Val) & (NumBuckets - 1);
unsigned ProbeAmt = 1;
while (true) {
BucketT *ThisBucket = BucketsPtr + BucketNo;
@@ -719,12 +700,12 @@ private:
// prefer to return it than something that would require more probing.
if (KeyInfoT::isEqual(ThisBucket->getFirst(), TombstoneKey) &&
!FoundTombstone)
- FoundTombstone = ThisBucket; // Remember the first tombstone found.
+ FoundTombstone = ThisBucket; // Remember the first tombstone found.
// Otherwise, it's a hash collision or a tombstone, continue quadratic
// probing.
BucketNo += ProbeAmt++;
- BucketNo &= (NumBuckets-1);
+ BucketNo &= (NumBuckets - 1);
}
}
@@ -733,9 +714,7 @@ public:
/// This is just the raw memory used by DenseMap.
/// If entries are pointers to objects, the size of the referenced objects
/// are not included.
- size_t getMemorySize() const {
- return getNumBuckets() * sizeof(BucketT);
- }
+ size_t getMemorySize() const { return getNumBuckets() * sizeof(BucketT); }
};
/// Equality comparison for DenseMap.
@@ -803,8 +782,7 @@ public:
swap(other);
}
- template<typename InputIt>
- DenseMap(const InputIt &I, const InputIt &E) {
+ template <typename InputIt> DenseMap(const InputIt &I, const InputIt &E) {
init(std::distance(I, E));
this->insert(I, E);
}
@@ -819,7 +797,7 @@ public:
deallocate_buffer(Buckets, sizeof(BucketT) * NumBuckets, alignof(BucketT));
}
- void swap(DenseMap& RHS) {
+ void swap(DenseMap &RHS) {
this->incrementEpoch();
RHS.incrementEpoch();
std::swap(Buckets, RHS.Buckets);
@@ -828,13 +806,13 @@ public:
std::swap(NumBuckets, RHS.NumBuckets);
}
- DenseMap& operator=(const DenseMap& other) {
+ DenseMap &operator=(const DenseMap &other) {
if (&other != this)
copyFrom(other);
return *this;
}
- DenseMap& operator=(DenseMap &&other) {
+ DenseMap &operator=(DenseMap &&other) {
this->destroyAll();
deallocate_buffer(Buckets, sizeof(BucketT) * NumBuckets, alignof(BucketT));
init(0);
@@ -842,7 +820,7 @@ public:
return *this;
}
- void copyFrom(const DenseMap& other) {
+ void copyFrom(const DenseMap &other) {
this->destroyAll();
deallocate_buffer(Buckets, sizeof(BucketT) * NumBuckets, alignof(BucketT));
if (allocateBuckets(other.NumBuckets)) {
@@ -867,14 +845,15 @@ public:
unsigned OldNumBuckets = NumBuckets;
BucketT *OldBuckets = Buckets;
- allocateBuckets(std::max<unsigned>(64, static_cast<unsigned>(NextPowerOf2(AtLeast-1))));
+ allocateBuckets(std::max<unsigned>(
+ 64, static_cast<unsigned>(NextPowerOf2(AtLeast - 1))));
assert(Buckets);
if (!OldBuckets) {
this->BaseT::initEmpty();
return;
}
- this->moveFromOldBuckets(OldBuckets, OldBuckets+OldNumBuckets);
+ this->moveFromOldBuckets(OldBuckets, OldBuckets + OldNumBuckets);
// Free the old table.
deallocate_buffer(OldBuckets, sizeof(BucketT) * OldNumBuckets,
@@ -901,29 +880,17 @@ public:
}
private:
- unsigned getNumEntries() const {
- return NumEntries;
- }
+ unsigned getNumEntries() const { return NumEntries; }
- void setNumEntries(unsigned Num) {
- NumEntries = Num;
- }
+ void setNumEntries(unsigned Num) { NumEntries = Num; }
- unsigned getNumTombstones() const {
- return NumTombstones;
- }
+ unsigned getNumTombstones() const { return NumTombstones; }
- void setNumTombstones(unsigned Num) {
- NumTombstones = Num;
- }
+ void setNumTombstones(unsigned Num) { NumTombstones = Num; }
- BucketT *getBuckets() const {
- return Buckets;
- }
+ BucketT *getBuckets() const { return Buckets; }
- unsigned getNumBuckets() const {
- return NumBuckets;
- }
+ unsigned getNumBuckets() const { return NumBuckets; }
bool allocateBuckets(unsigned Num) {
NumBuckets = Num;
@@ -984,7 +951,7 @@ public:
swap(other);
}
- template<typename InputIt>
+ template <typename InputIt>
SmallDenseMap(const InputIt &I, const InputIt &E) {
init(NextPowerOf2(std::distance(I, E)));
this->insert(I, E);
@@ -998,7 +965,7 @@ public:
deallocateBuckets();
}
- void swap(SmallDenseMap& RHS) {
+ void swap(SmallDenseMap &RHS) {
unsigned TmpNumEntries = RHS.NumEntries;
RHS.NumEntries = NumEntries;
NumEntries = TmpNumEntries;
@@ -1070,13 +1037,13 @@ public:
new (SmallSide.getLargeRep()) LargeRep(std::move(TmpRep));
}
- SmallDenseMap& operator=(const SmallDenseMap& other) {
+ SmallDenseMap &operator=(const SmallDenseMap &other) {
if (&other != this)
copyFrom(other);
return *this;
}
- SmallDenseMap& operator=(SmallDenseMap &&other) {
+ SmallDenseMap &operator=(SmallDenseMap &&other) {
this->destroyAll();
deallocateBuckets();
init(0);
@@ -1084,7 +1051,7 @@ public:
return *this;
}
- void copyFrom(const SmallDenseMap& other) {
+ void copyFrom(const SmallDenseMap &other) {
this->destroyAll();
deallocateBuckets();
Small = true;
@@ -1106,7 +1073,7 @@ public:
void grow(unsigned AtLeast) {
if (AtLeast > InlineBuckets)
- AtLeast = std::max<unsigned>(64, NextPowerOf2(AtLeast-1));
+ AtLeast = std::max<unsigned>(64, NextPowerOf2(AtLeast - 1));
if (Small) {
// First move the inline buckets into a temporary storage.
@@ -1150,7 +1117,8 @@ public:
new (getLargeRep()) LargeRep(allocateBuckets(AtLeast));
}
- this->moveFromOldBuckets(OldRep.Buckets, OldRep.Buckets+OldRep.NumBuckets);
+ this->moveFromOldBuckets(OldRep.Buckets,
+ OldRep.Buckets + OldRep.NumBuckets);
// Free the old table.
deallocate_buffer(OldRep.Buckets, sizeof(BucketT) * OldRep.NumBuckets,
@@ -1179,9 +1147,7 @@ public:
}
private:
- unsigned getNumEntries() const {
- return NumEntries;
- }
+ unsigned getNumEntries() const { return NumEntries; }
void setNumEntries(unsigned Num) {
// NumEntries is hardcoded to be 31 bits wide.
@@ -1189,13 +1155,9 @@ private:
NumEntries = Num;
}
- unsigned getNumTombstones() const {
- return NumTombstones;
- }
+ unsigned getNumTombstones() const { return NumTombstones; }
- void setNumTombstones(unsigned Num) {
- NumTombstones = Num;
- }
+ void setNumTombstones(unsigned Num) { NumTombstones = Num; }
const BucketT *getInlineBuckets() const {
assert(Small);
@@ -1207,7 +1169,7 @@ private:
BucketT *getInlineBuckets() {
return const_cast<BucketT *>(
- const_cast<const SmallDenseMap *>(this)->getInlineBuckets());
+ const_cast<const SmallDenseMap *>(this)->getInlineBuckets());
}
const LargeRep *getLargeRep() const {
@@ -1218,7 +1180,7 @@ private:
LargeRep *getLargeRep() {
return const_cast<LargeRep *>(
- const_cast<const SmallDenseMap *>(this)->getLargeRep());
+ const_cast<const SmallDenseMap *>(this)->getLargeRep());
}
const BucketT *getBuckets() const {
@@ -1227,7 +1189,7 @@ private:
BucketT *getBuckets() {
return const_cast<BucketT *>(
- const_cast<const SmallDenseMap *>(this)->getBuckets());
+ const_cast<const SmallDenseMap *>(this)->getBuckets());
}
unsigned getNumBuckets() const {
@@ -1278,7 +1240,8 @@ public:
: DebugEpochBase::HandleBase(&Epoch), Ptr(Pos), End(E) {
assert(isHandleInSync() && "invalid construction!");
- if (NoAdvance) return;
+ if (NoAdvance)
+ return;
if (shouldReverseIterate<KeyT>()) {
RetreatPastEmptyBuckets();
return;
@@ -1324,7 +1287,7 @@ public:
return !(LHS == RHS);
}
- inline DenseMapIterator& operator++() { // Preincrement
+ inline DenseMapIterator &operator++() { // Preincrement
assert(isHandleInSync() && "invalid iterator access!");
assert(Ptr != End && "incrementing end() iterator");
if (shouldReverseIterate<KeyT>()) {
@@ -1336,9 +1299,11 @@ public:
AdvancePastEmptyBuckets();
return *this;
}
- DenseMapIterator operator++(int) { // Postincrement
+ DenseMapIterator operator++(int) { // Postincrement
assert(isHandleInSync() && "invalid iterator access!");
- DenseMapIterator tmp = *this; ++*this; return tmp;
+ DenseMapIterator tmp = *this;
+ ++*this;
+ return tmp;
}
private:
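
Most of the DenseMap.h hunks are clang-format churn, but the clear() change from a runtime if to if constexpr on is_trivially_destructible is worth a note: the destructor loop in the discarded branch is no longer instantiated for trivial value types. A small standalone illustration of the difference (not DenseMap itself):

#include <cstdio>
#include <new>
#include <type_traits>

struct Noisy {
  ~Noisy() { std::puts("~Noisy"); }
};

// Mirrors the clear() change: with if constexpr, the destructor loop below
// is never instantiated when T is trivially destructible.
template <typename T> void destroy_all(T *begin, T *end) {
  if constexpr (std::is_trivially_destructible_v<T>) {
    (void)begin;
    (void)end; // nothing to destroy; take the simple path
  } else {
    for (T *p = begin; p != end; ++p)
      p->~T();
  }
}

int main() {
  alignas(Noisy) unsigned char raw[2 * sizeof(Noisy)];
  Noisy *objs = reinterpret_cast<Noisy *>(raw);
  new (objs) Noisy;
  new (objs + 1) Noisy;
  destroy_all(objs, objs + 2); // prints ~Noisy twice

  int ints[4] = {};
  destroy_all(ints, ints + 4); // no destructor loop instantiated
}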
diff --git a/llvm/include/llvm/ADT/DenseSet.h b/llvm/include/llvm/ADT/DenseSet.h
index b89c886..a307bd8 100644
--- a/llvm/include/llvm/ADT/DenseSet.h
+++ b/llvm/include/llvm/ADT/DenseSet.h
@@ -89,18 +89,12 @@ public:
/// before resizing again.
void reserve(size_t Size) { TheMap.reserve(Size); }
- void clear() {
- TheMap.clear();
- }
+ void clear() { TheMap.clear(); }
/// Return 1 if the specified key is in the set, 0 otherwise.
- size_type count(const_arg_type_t<ValueT> V) const {
- return TheMap.count(V);
- }
+ size_type count(const_arg_type_t<ValueT> V) const { return TheMap.count(V); }
- bool erase(const ValueT &V) {
- return TheMap.erase(V);
- }
+ bool erase(const ValueT &V) { return TheMap.erase(V); }
void swap(DenseSetImpl &RHS) { TheMap.swap(RHS.TheMap); }
@@ -128,8 +122,15 @@ public:
ValueT *operator->() { return &I->getFirst(); }
const ValueT *operator->() const { return &I->getFirst(); }
- Iterator& operator++() { ++I; return *this; }
- Iterator operator++(int) { auto T = *this; ++I; return T; }
+ Iterator &operator++() {
+ ++I;
+ return *this;
+ }
+ Iterator operator++(int) {
+ auto T = *this;
+ ++I;
+ return T;
+ }
friend bool operator==(const Iterator &X, const Iterator &Y) {
return X.I == Y.I;
}
@@ -157,8 +158,15 @@ public:
const ValueT &operator*() const { return I->getFirst(); }
const ValueT *operator->() const { return &I->getFirst(); }
- ConstIterator& operator++() { ++I; return *this; }
- ConstIterator operator++(int) { auto T = *this; ++I; return T; }
+ ConstIterator &operator++() {
+ ++I;
+ return *this;
+ }
+ ConstIterator operator++(int) {
+ auto T = *this;
+ ++I;
+ return T;
+ }
friend bool operator==(const ConstIterator &X, const ConstIterator &Y) {
return X.I == Y.I;
}
@@ -191,8 +199,7 @@ public:
/// The DenseMapInfo is responsible for supplying methods
/// getHashValue(LookupKeyT) and isEqual(LookupKeyT, KeyT) for each key type
/// used.
- template <class LookupKeyT>
- iterator find_as(const LookupKeyT &Val) {
+ template <class LookupKeyT> iterator find_as(const LookupKeyT &Val) {
return Iterator(TheMap.find_as(Val));
}
template <class LookupKeyT>
@@ -226,8 +233,7 @@ public:
}
// Range insertion of values.
- template<typename InputIt>
- void insert(InputIt I, InputIt E) {
+ template <typename InputIt> void insert(InputIt I, InputIt E) {
for (; I != E; ++I)
insert(*I);
}
@@ -266,8 +272,9 @@ bool operator!=(const DenseSetImpl<ValueT, MapTy, ValueInfoT> &LHS,
/// Implements a dense probed hash-table based set.
template <typename ValueT, typename ValueInfoT = DenseMapInfo<ValueT>>
class DenseSet : public detail::DenseSetImpl<
- ValueT, DenseMap<ValueT, detail::DenseSetEmpty, ValueInfoT,
- detail::DenseSetPair<ValueT>>,
+ ValueT,
+ DenseMap<ValueT, detail::DenseSetEmpty, ValueInfoT,
+ detail::DenseSetPair<ValueT>>,
ValueInfoT> {
using BaseT =
detail::DenseSetImpl<ValueT,
@@ -285,12 +292,14 @@ template <typename ValueT, unsigned InlineBuckets = 4,
typename ValueInfoT = DenseMapInfo<ValueT>>
class SmallDenseSet
: public detail::DenseSetImpl<
- ValueT, SmallDenseMap<ValueT, detail::DenseSetEmpty, InlineBuckets,
- ValueInfoT, detail::DenseSetPair<ValueT>>,
+ ValueT,
+ SmallDenseMap<ValueT, detail::DenseSetEmpty, InlineBuckets,
+ ValueInfoT, detail::DenseSetPair<ValueT>>,
ValueInfoT> {
using BaseT = detail::DenseSetImpl<
- ValueT, SmallDenseMap<ValueT, detail::DenseSetEmpty, InlineBuckets,
- ValueInfoT, detail::DenseSetPair<ValueT>>,
+ ValueT,
+ SmallDenseMap<ValueT, detail::DenseSetEmpty, InlineBuckets, ValueInfoT,
+ detail::DenseSetPair<ValueT>>,
ValueInfoT>;
public:
diff --git a/llvm/include/llvm/Analysis/MemorySSA.h b/llvm/include/llvm/Analysis/MemorySSA.h
index c5eff15..09fc34a 100644
--- a/llvm/include/llvm/Analysis/MemorySSA.h
+++ b/llvm/include/llvm/Analysis/MemorySSA.h
@@ -218,8 +218,8 @@ protected:
inline unsigned getID() const;
MemoryAccess(LLVMContext &C, unsigned Vty, DeleteValueTy DeleteValue,
- BasicBlock *BB, unsigned NumOperands)
- : DerivedUser(Type::getVoidTy(C), Vty, nullptr, NumOperands, DeleteValue),
+ BasicBlock *BB, AllocInfo AllocInfo)
+ : DerivedUser(Type::getVoidTy(C), Vty, AllocInfo, DeleteValue),
Block(BB) {}
// Use deleteValue() to delete a generic MemoryAccess.
@@ -280,8 +280,8 @@ protected:
MemoryUseOrDef(LLVMContext &C, MemoryAccess *DMA, unsigned Vty,
DeleteValueTy DeleteValue, Instruction *MI, BasicBlock *BB,
- unsigned NumOperands)
- : MemoryAccess(C, Vty, DeleteValue, BB, NumOperands),
+ AllocInfo AllocInfo)
+ : MemoryAccess(C, Vty, DeleteValue, BB, AllocInfo),
MemoryInstruction(MI) {
setDefiningAccess(DMA);
}
@@ -307,15 +307,16 @@ private:
/// MemoryUse's is exactly the set of Instructions for which
/// AliasAnalysis::getModRefInfo returns "Ref".
class MemoryUse final : public MemoryUseOrDef {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{1};
+
public:
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(MemoryAccess);
MemoryUse(LLVMContext &C, MemoryAccess *DMA, Instruction *MI, BasicBlock *BB)
- : MemoryUseOrDef(C, DMA, MemoryUseVal, deleteMe, MI, BB,
- /*NumOperands=*/1) {}
+ : MemoryUseOrDef(C, DMA, MemoryUseVal, deleteMe, MI, BB, AllocMarker) {}
// allocate space for exactly one operand
- void *operator new(size_t S) { return User::operator new(S, 1); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
static bool classof(const Value *MA) {
@@ -367,6 +368,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(MemoryUse, MemoryAccess)
/// associated with them. This use points to the nearest reaching
/// MemoryDef/MemoryPhi.
class MemoryDef final : public MemoryUseOrDef {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{2};
+
public:
friend class MemorySSA;
@@ -374,12 +377,11 @@ public:
MemoryDef(LLVMContext &C, MemoryAccess *DMA, Instruction *MI, BasicBlock *BB,
unsigned Ver)
- : MemoryUseOrDef(C, DMA, MemoryDefVal, deleteMe, MI, BB,
- /*NumOperands=*/2),
+ : MemoryUseOrDef(C, DMA, MemoryDefVal, deleteMe, MI, BB, AllocMarker),
ID(Ver) {}
// allocate space for exactly two operands
- void *operator new(size_t S) { return User::operator new(S, 2); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
static bool classof(const Value *MA) {
@@ -474,8 +476,10 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(MemoryUseOrDef, MemoryAccess)
/// Because MemoryUse's do not generate new definitions, they do not have this
/// issue.
class MemoryPhi final : public MemoryAccess {
+ constexpr static HungOffOperandsAllocMarker AllocMarker{};
+
// allocate space for exactly zero operands
- void *operator new(size_t S) { return User::operator new(S); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
public:
void operator delete(void *Ptr) { User::operator delete(Ptr); }
@@ -484,7 +488,7 @@ public:
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(MemoryAccess);
MemoryPhi(LLVMContext &C, BasicBlock *BB, unsigned Ver, unsigned NumPreds = 0)
- : MemoryAccess(C, MemoryPhiVal, deleteMe, BB, 0), ID(Ver),
+ : MemoryAccess(C, MemoryPhiVal, deleteMe, BB, AllocMarker), ID(Ver),
ReservedSpace(NumPreds) {
allocHungoffUses(ReservedSpace);
}
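
This is one instance of a wider migration (continued in Constant.h and Constants.h below): constructors that used to take a bare unsigned NumOperands now take a marker object, so each subclass states its operand count once as a constexpr static and passes the same marker to both operator new and the base constructor. A self-contained sketch of the tag-type idea with hypothetical names (not LLVM's actual User/AllocInfo machinery):

#include <cstdio>

// Hypothetical stand-in for IntrusiveOperandsAllocMarker: a tag carrying
// the operand count as typed data instead of a bare unsigned argument.
struct OperandsAllocMarker {
  unsigned NumOps;
};

struct Base {
  unsigned NumOperands;
  // Callers must name a marker; a raw integer no longer converts silently.
  explicit Base(OperandsAllocMarker M) : NumOperands(M.NumOps) {}
};

struct UseLike final : Base {
  static constexpr OperandsAllocMarker AllocMarker{1};
  UseLike() : Base(AllocMarker) {}
};

struct DefLike final : Base {
  static constexpr OperandsAllocMarker AllocMarker{2};
  DefLike() : Base(AllocMarker) {}
};

int main() {
  std::printf("UseLike operands: %u\n", UseLike().NumOperands);
  std::printf("DefLike operands: %u\n", DefLike().NumOperands);
}

The payoff is the one shown in the MemoryUse/MemoryDef hunks above: the operand count lives in a single declaration per class instead of being repeated as a magic number in the constructor and in operator new.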
diff --git a/llvm/include/llvm/IR/Constant.h b/llvm/include/llvm/IR/Constant.h
index a82e37b..0aefb5e 100644
--- a/llvm/include/llvm/IR/Constant.h
+++ b/llvm/include/llvm/IR/Constant.h
@@ -41,8 +41,8 @@ class APInt;
/// LLVM Constant Representation
class Constant : public User {
protected:
- Constant(Type *ty, ValueTy vty, Use *Ops, unsigned NumOps)
- : User(ty, vty, Ops, NumOps) {}
+ Constant(Type *ty, ValueTy vty, AllocInfo AllocInfo)
+ : User(ty, vty, AllocInfo) {}
~Constant() = default;
diff --git a/llvm/include/llvm/IR/Constants.h b/llvm/include/llvm/IR/Constants.h
index 62ccde9..3b16aa0 100644
--- a/llvm/include/llvm/IR/Constants.h
+++ b/llvm/include/llvm/IR/Constants.h
@@ -51,6 +51,8 @@ template <class ConstantClass> struct ConstantAggrKeyType;
/// Since they can be in use by unrelated modules (and are never based on
/// GlobalValues), it never makes sense to RAUW them.
class ConstantData : public Constant {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{0};
+
friend class Constant;
Value *handleOperandChangeImpl(Value *From, Value *To) {
@@ -58,9 +60,9 @@ class ConstantData : public Constant {
}
protected:
- explicit ConstantData(Type *Ty, ValueTy VT) : Constant(Ty, VT, nullptr, 0) {}
+ explicit ConstantData(Type *Ty, ValueTy VT) : Constant(Ty, VT, AllocMarker) {}
- void *operator new(size_t S) { return User::operator new(S, 0); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
public:
void operator delete(void *Ptr) { User::operator delete(Ptr); }
@@ -399,7 +401,8 @@ public:
/// use operands.
class ConstantAggregate : public Constant {
protected:
- ConstantAggregate(Type *T, ValueTy VT, ArrayRef<Constant *> V);
+ ConstantAggregate(Type *T, ValueTy VT, ArrayRef<Constant *> V,
+ AllocInfo AllocInfo);
public:
/// Transparently provide more efficient getOperand methods.
@@ -425,7 +428,7 @@ class ConstantArray final : public ConstantAggregate {
friend struct ConstantAggrKeyType<ConstantArray>;
friend class Constant;
- ConstantArray(ArrayType *T, ArrayRef<Constant *> Val);
+ ConstantArray(ArrayType *T, ArrayRef<Constant *> Val, AllocInfo AllocInfo);
void destroyConstantImpl();
Value *handleOperandChangeImpl(Value *From, Value *To);
@@ -457,7 +460,7 @@ class ConstantStruct final : public ConstantAggregate {
friend struct ConstantAggrKeyType<ConstantStruct>;
friend class Constant;
- ConstantStruct(StructType *T, ArrayRef<Constant *> Val);
+ ConstantStruct(StructType *T, ArrayRef<Constant *> Val, AllocInfo AllocInfo);
void destroyConstantImpl();
Value *handleOperandChangeImpl(Value *From, Value *To);
@@ -509,7 +512,7 @@ class ConstantVector final : public ConstantAggregate {
friend struct ConstantAggrKeyType<ConstantVector>;
friend class Constant;
- ConstantVector(VectorType *T, ArrayRef<Constant *> Val);
+ ConstantVector(VectorType *T, ArrayRef<Constant *> Val, AllocInfo AllocInfo);
void destroyConstantImpl();
Value *handleOperandChangeImpl(Value *From, Value *To);
@@ -890,9 +893,11 @@ public:
class BlockAddress final : public Constant {
friend class Constant;
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{2};
+
BlockAddress(Function *F, BasicBlock *BB);
- void *operator new(size_t S) { return User::operator new(S, 2); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void destroyConstantImpl();
Value *handleOperandChangeImpl(Value *From, Value *To);
@@ -936,9 +941,11 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(BlockAddress, Value)
class DSOLocalEquivalent final : public Constant {
friend class Constant;
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{1};
+
DSOLocalEquivalent(GlobalValue *GV);
- void *operator new(size_t S) { return User::operator new(S, 1); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void destroyConstantImpl();
Value *handleOperandChangeImpl(Value *From, Value *To);
@@ -973,9 +980,11 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(DSOLocalEquivalent, Value)
class NoCFIValue final : public Constant {
friend class Constant;
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{1};
+
NoCFIValue(GlobalValue *GV);
- void *operator new(size_t S) { return User::operator new(S, 1); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void destroyConstantImpl();
Value *handleOperandChangeImpl(Value *From, Value *To);
@@ -1013,10 +1022,12 @@ class ConstantPtrAuth final : public Constant {
friend struct ConstantPtrAuthKeyType;
friend class Constant;
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{4};
+
ConstantPtrAuth(Constant *Ptr, ConstantInt *Key, ConstantInt *Disc,
Constant *AddrDisc);
- void *operator new(size_t s) { return User::operator new(s, 4); }
+ void *operator new(size_t s) { return User::operator new(s, AllocMarker); }
void destroyConstantImpl();
Value *handleOperandChangeImpl(Value *From, Value *To);
@@ -1102,8 +1113,8 @@ class ConstantExpr : public Constant {
Value *handleOperandChangeImpl(Value *From, Value *To);
protected:
- ConstantExpr(Type *ty, unsigned Opcode, Use *Ops, unsigned NumOps)
- : Constant(ty, ConstantExprVal, Ops, NumOps) {
+ ConstantExpr(Type *ty, unsigned Opcode, AllocInfo AllocInfo)
+ : Constant(ty, ConstantExprVal, AllocInfo) {
// Operation type (an Instruction opcode) is stored as the SubclassData.
setValueSubclassData(Opcode);
}
diff --git a/llvm/include/llvm/IR/DerivedUser.h b/llvm/include/llvm/IR/DerivedUser.h
index a25d316..a307315 100644
--- a/llvm/include/llvm/IR/DerivedUser.h
+++ b/llvm/include/llvm/IR/DerivedUser.h
@@ -34,9 +34,9 @@ private:
DeleteValueTy DeleteValue;
public:
- DerivedUser(Type *Ty, unsigned VK, Use *U, unsigned NumOps,
+ DerivedUser(Type *Ty, unsigned VK, AllocInfo AllocInfo,
DeleteValueTy DeleteValue)
- : User(Ty, VK, U, NumOps), DeleteValue(DeleteValue) {}
+ : User(Ty, VK, AllocInfo), DeleteValue(DeleteValue) {}
};
} // end namespace llvm
diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h
index f7e4e97..866c68d 100644
--- a/llvm/include/llvm/IR/Function.h
+++ b/llvm/include/llvm/IR/Function.h
@@ -72,6 +72,8 @@ public:
using const_arg_iterator = const Argument *;
private:
+ constexpr static HungOffOperandsAllocMarker AllocMarker{};
+
// Important things that make up a function!
BasicBlockListType BasicBlocks; ///< The basic blocks
@@ -171,13 +173,14 @@ public:
static Function *Create(FunctionType *Ty, LinkageTypes Linkage,
unsigned AddrSpace, const Twine &N = "",
Module *M = nullptr) {
- return new Function(Ty, Linkage, AddrSpace, N, M);
+ return new (AllocMarker) Function(Ty, Linkage, AddrSpace, N, M);
}
// TODO: remove this once all users have been updated to pass an AddrSpace
static Function *Create(FunctionType *Ty, LinkageTypes Linkage,
const Twine &N = "", Module *M = nullptr) {
- return new Function(Ty, Linkage, static_cast<unsigned>(-1), N, M);
+ return new (AllocMarker)
+ Function(Ty, Linkage, static_cast<unsigned>(-1), N, M);
}
/// Creates a new function and attaches it to a module.
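
Function is the first hung-off case in the patch: its operands live in a separate allocation, so the marker carries no count, and the choice is now spelled out at the call site via placement new. A condensed sketch of the shape, taken from the Function hunks above:

// HungOffOperandsAllocMarker is empty: the operand list is allocated
// separately (see allocHungoffUses) and may grow after construction.
constexpr static HungOffOperandsAllocMarker AllocMarker{};

static Function *Create(FunctionType *Ty, LinkageTypes Linkage,
                        unsigned AddrSpace, const Twine &N, Module *M) {
  return new (AllocMarker) Function(Ty, Linkage, AddrSpace, N, M);
}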
diff --git a/llvm/include/llvm/IR/GlobalAlias.h b/llvm/include/llvm/IR/GlobalAlias.h
index 583d66e..3db6984 100644
--- a/llvm/include/llvm/IR/GlobalAlias.h
+++ b/llvm/include/llvm/IR/GlobalAlias.h
@@ -28,6 +28,8 @@ template <typename ValueSubClass, typename... Args> class SymbolTableListTraits;
class GlobalAlias : public GlobalValue, public ilist_node<GlobalAlias> {
friend class SymbolTableListTraits<GlobalAlias>;
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{1};
+
GlobalAlias(Type *Ty, unsigned AddressSpace, LinkageTypes Linkage,
const Twine &Name, Constant *Aliasee, Module *Parent);
@@ -59,7 +61,7 @@ public:
static GlobalAlias *create(const Twine &Name, GlobalValue *Aliasee);
// allocate space for exactly one operand
- void *operator new(size_t S) { return User::operator new(S, 1); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
/// Provide fast operand accessors
diff --git a/llvm/include/llvm/IR/GlobalIFunc.h b/llvm/include/llvm/IR/GlobalIFunc.h
index 8935284..0d2f152 100644
--- a/llvm/include/llvm/IR/GlobalIFunc.h
+++ b/llvm/include/llvm/IR/GlobalIFunc.h
@@ -34,6 +34,8 @@ template <typename ValueSubClass, typename... Args> class SymbolTableListTraits;
class GlobalIFunc final : public GlobalObject, public ilist_node<GlobalIFunc> {
friend class SymbolTableListTraits<GlobalIFunc>;
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{1};
+
GlobalIFunc(Type *Ty, unsigned AddressSpace, LinkageTypes Linkage,
const Twine &Name, Constant *Resolver, Module *Parent);
@@ -48,7 +50,7 @@ public:
Constant *Resolver, Module *Parent);
// allocate space for exactly one operand
- void *operator new(size_t S) { return User::operator new(S, 1); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
/// Provide fast operand accessors
diff --git a/llvm/include/llvm/IR/GlobalObject.h b/llvm/include/llvm/IR/GlobalObject.h
index b6a974d..08edc13 100644
--- a/llvm/include/llvm/IR/GlobalObject.h
+++ b/llvm/include/llvm/IR/GlobalObject.h
@@ -40,10 +40,9 @@ public:
};
protected:
- GlobalObject(Type *Ty, ValueTy VTy, Use *Ops, unsigned NumOps,
- LinkageTypes Linkage, const Twine &Name,
- unsigned AddressSpace = 0)
- : GlobalValue(Ty, VTy, Ops, NumOps, Linkage, Name, AddressSpace) {
+ GlobalObject(Type *Ty, ValueTy VTy, AllocInfo AllocInfo, LinkageTypes Linkage,
+ const Twine &Name, unsigned AddressSpace = 0)
+ : GlobalValue(Ty, VTy, AllocInfo, Linkage, Name, AddressSpace) {
setGlobalValueSubClassData(0);
}
~GlobalObject();
diff --git a/llvm/include/llvm/IR/GlobalValue.h b/llvm/include/llvm/IR/GlobalValue.h
index 53eddeb..d9104d7 100644
--- a/llvm/include/llvm/IR/GlobalValue.h
+++ b/llvm/include/llvm/IR/GlobalValue.h
@@ -77,9 +77,9 @@ public:
};
protected:
- GlobalValue(Type *Ty, ValueTy VTy, Use *Ops, unsigned NumOps,
- LinkageTypes Linkage, const Twine &Name, unsigned AddressSpace)
- : Constant(PointerType::get(Ty, AddressSpace), VTy, Ops, NumOps),
+ GlobalValue(Type *Ty, ValueTy VTy, AllocInfo AllocInfo, LinkageTypes Linkage,
+ const Twine &Name, unsigned AddressSpace)
+ : Constant(PointerType::get(Ty, AddressSpace), VTy, AllocInfo),
ValueType(Ty), Visibility(DefaultVisibility),
UnnamedAddrVal(unsigned(UnnamedAddr::None)),
DllStorageClass(DefaultStorageClass), ThreadLocal(NotThreadLocal),
diff --git a/llvm/include/llvm/IR/GlobalVariable.h b/llvm/include/llvm/IR/GlobalVariable.h
index 0736c300..83e4848 100644
--- a/llvm/include/llvm/IR/GlobalVariable.h
+++ b/llvm/include/llvm/IR/GlobalVariable.h
@@ -39,6 +39,8 @@ class DIGlobalVariableExpression;
class GlobalVariable : public GlobalObject, public ilist_node<GlobalVariable> {
friend class SymbolTableListTraits<GlobalVariable>;
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{1};
+
AttributeSet Attrs;
// Is this a global constant?
@@ -70,24 +72,31 @@ public:
GlobalVariable(const GlobalVariable &) = delete;
GlobalVariable &operator=(const GlobalVariable &) = delete;
+private:
+ /// Set the number of operands on a GlobalVariable.
+ ///
+ /// GlobalVariable always allocates space for a single operand, but
+ /// doesn't always use it.
+ void setGlobalVariableNumOperands(unsigned NumOps) {
+ assert(NumOps <= 1 && "GlobalVariable can only have 0 or 1 operands");
+ NumUserOperands = NumOps;
+ }
+
+public:
~GlobalVariable() {
dropAllReferences();
+
+ // The number of operands can be set to 0 after construction and
+ // initialization. Make sure it is reset to 1 here, as this is needed in
+ // User::operator delete.
+ setGlobalVariableNumOperands(1);
}
// allocate space for exactly one operand
- void *operator new(size_t s) {
- return User::operator new(s, 1);
- }
+ void *operator new(size_t s) { return User::operator new(s, AllocMarker); }
// delete space for exactly one operand as created in the corresponding new operator
- void operator delete(void *ptr){
- assert(ptr != nullptr && "must not be nullptr");
- User *Obj = static_cast<User *>(ptr);
- // Number of operands can be set to 0 after construction and initialization. Make sure
- // that number of operands is reset to 1, as this is needed in User::operator delete
- Obj->setGlobalVariableNumOperands(1);
- User::operator delete(Obj);
- }
+ void operator delete(void *ptr) { User::operator delete(ptr); }
/// Provide fast operand accessors
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
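
The destructor change works because User::operator delete recomputes the start of the combined allocation from the operand count. A simplified illustration of that layout arithmetic (an assumption-labeled sketch, not the actual LLVM implementation):

// Co-allocated layout: [Use #0 ... Use #N-1][User object]. If delete ran
// while NumUserOperands was still 0, the computed block start would be off
// by one Use, which is why ~GlobalVariable restores the count to 1 first.
void operator delete(void *Usr) {
  User *Obj = static_cast<User *>(Usr);
  Use *Storage = static_cast<Use *>(Usr) - Obj->getNumOperands();
  ::operator delete(Storage);
}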
diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h
index 5ed3ec4..4720533 100644
--- a/llvm/include/llvm/IR/InstrTypes.h
+++ b/llvm/include/llvm/IR/InstrTypes.h
@@ -55,24 +55,26 @@ typedef unsigned ID;
//===----------------------------------------------------------------------===//
class UnaryInstruction : public Instruction {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{1};
+
protected:
UnaryInstruction(Type *Ty, unsigned iType, Value *V, BasicBlock::iterator IB)
- : Instruction(Ty, iType, &Op<0>(), 1, IB) {
+ : Instruction(Ty, iType, AllocMarker, IB) {
Op<0>() = V;
}
UnaryInstruction(Type *Ty, unsigned iType, Value *V,
Instruction *IB = nullptr)
- : Instruction(Ty, iType, &Op<0>(), 1, IB) {
+ : Instruction(Ty, iType, AllocMarker, IB) {
Op<0>() = V;
}
UnaryInstruction(Type *Ty, unsigned iType, Value *V, BasicBlock *IAE)
- : Instruction(Ty, iType, &Op<0>(), 1, IAE) {
+ : Instruction(Ty, iType, AllocMarker, IAE) {
Op<0>() = V;
}
public:
// allocate space for exactly one operand
- void *operator new(size_t S) { return User::operator new(S, 1); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
/// Transparently provide more efficient getOperand methods.
@@ -186,6 +188,8 @@ public:
//===----------------------------------------------------------------------===//
class BinaryOperator : public Instruction {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{2};
+
void AssertOK();
protected:
@@ -199,7 +203,7 @@ protected:
public:
// allocate space for exactly two operands
- void *operator new(size_t S) { return User::operator new(S, 2); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
/// Transparently provide more efficient getOperand methods.
@@ -745,6 +749,8 @@ public:
/// This class is the base class for the comparison instructions.
/// Abstract base class of comparison instructions.
class CmpInst : public Instruction {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{2};
+
public:
/// This enumeration lists the possible predicates for CmpInst subclasses.
/// Values in the range 0-31 are reserved for FCmpInst, while values in the
@@ -814,7 +820,7 @@ protected:
public:
// allocate space for exactly two operands
- void *operator new(size_t S) { return User::operator new(S, 2); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
/// Construct a compare instruction, given the opcode, the predicate and
@@ -2416,10 +2422,10 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CallBase, Value)
//===----------------------------------------------------------------------===//
class FuncletPadInst : public Instruction {
private:
- FuncletPadInst(const FuncletPadInst &CPI);
+ FuncletPadInst(const FuncletPadInst &CPI, AllocInfo AllocInfo);
explicit FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad,
- ArrayRef<Value *> Args, unsigned Values,
+ ArrayRef<Value *> Args, AllocInfo AllocInfo,
const Twine &NameStr, InsertPosition InsertBefore);
void init(Value *ParentPad, ArrayRef<Value *> Args, const Twine &NameStr);
diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h
index c275723..a12d5d9 100644
--- a/llvm/include/llvm/IR/Instruction.h
+++ b/llvm/include/llvm/IR/Instruction.h
@@ -1030,7 +1030,7 @@ protected:
setValueSubclassData(Storage);
}
- Instruction(Type *Ty, unsigned iType, Use *Ops, unsigned NumOps,
+ Instruction(Type *Ty, unsigned iType, AllocInfo AllocInfo,
InsertPosition InsertBefore = nullptr);
private:
diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h
index ab3321e..e89739a 100644
--- a/llvm/include/llvm/IR/Instructions.h
+++ b/llvm/include/llvm/IR/Instructions.h
@@ -297,6 +297,8 @@ class StoreInst : public Instruction {
void AssertOK();
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{2};
+
protected:
// Note: Instruction needs to be a friend here to call cloneImpl.
friend class Instruction;
@@ -314,7 +316,7 @@ public:
InsertPosition InsertBefore = nullptr);
// allocate space for exactly two operands
- void *operator new(size_t S) { return User::operator new(S, 2); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
/// Return true if this is a store to a volatile memory location.
@@ -420,6 +422,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(StoreInst, Value)
class FenceInst : public Instruction {
using OrderingField = AtomicOrderingBitfieldElementT<0>;
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{0};
+
void Init(AtomicOrdering Ordering, SyncScope::ID SSID);
protected:
@@ -436,7 +440,7 @@ public:
InsertPosition InsertBefore = nullptr);
// allocate space for exactly zero operands
- void *operator new(size_t S) { return User::operator new(S, 0); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
/// Returns the ordering constraint of this fence instruction.
@@ -502,6 +506,8 @@ class AtomicCmpXchgInst : public Instruction {
typename Bitfield::Element<AtomicOrdering, Offset, 3,
AtomicOrdering::LAST>;
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{3};
+
protected:
// Note: Instruction needs to be a friend here to call cloneImpl.
friend class Instruction;
@@ -515,7 +521,7 @@ public:
InsertPosition InsertBefore = nullptr);
// allocate space for exactly three operands
- void *operator new(size_t S) { return User::operator new(S, 3); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
using VolatileField = BoolBitfieldElementT<0>;
@@ -774,13 +780,15 @@ private:
using BinOpBitfieldElement =
typename Bitfield::Element<BinOp, Offset, 5, BinOp::LAST_BINOP>;
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{2};
+
public:
AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val, Align Alignment,
AtomicOrdering Ordering, SyncScope::ID SSID,
InsertPosition InsertBefore = nullptr);
// allocate space for exactly two operands
- void *operator new(size_t S) { return User::operator new(S, 2); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
using VolatileField = BoolBitfieldElementT<0>;
@@ -924,14 +932,14 @@ class GetElementPtrInst : public Instruction {
Type *SourceElementType;
Type *ResultElementType;
- GetElementPtrInst(const GetElementPtrInst &GEPI);
+ GetElementPtrInst(const GetElementPtrInst &GEPI, AllocInfo AllocInfo);
/// Constructors - Create a getelementptr instruction with a base pointer and
/// a list of indices. The first and second ctor can optionally insert before an
/// existing instruction, the third appends the new instruction to the
/// specified BasicBlock.
inline GetElementPtrInst(Type *PointeeType, Value *Ptr,
- ArrayRef<Value *> IdxList, unsigned Values,
+ ArrayRef<Value *> IdxList, AllocInfo AllocInfo,
const Twine &NameStr, InsertPosition InsertBefore);
void init(Value *Ptr, ArrayRef<Value *> IdxList, const Twine &NameStr);
@@ -949,8 +957,9 @@ public:
InsertPosition InsertBefore = nullptr) {
unsigned Values = 1 + unsigned(IdxList.size());
assert(PointeeType && "Must specify element type");
- return new (Values) GetElementPtrInst(PointeeType, Ptr, IdxList, Values,
- NameStr, InsertBefore);
+ IntrusiveOperandsAllocMarker AllocMarker{Values};
+ return new (AllocMarker) GetElementPtrInst(
+ PointeeType, Ptr, IdxList, AllocMarker, NameStr, InsertBefore);
}
static GetElementPtrInst *Create(Type *PointeeType, Value *Ptr,
@@ -1124,12 +1133,11 @@ struct OperandTraits<GetElementPtrInst>
: public VariadicOperandTraits<GetElementPtrInst> {};
GetElementPtrInst::GetElementPtrInst(Type *PointeeType, Value *Ptr,
- ArrayRef<Value *> IdxList, unsigned Values,
- const Twine &NameStr,
+ ArrayRef<Value *> IdxList,
+ AllocInfo AllocInfo, const Twine &NameStr,
InsertPosition InsertBefore)
- : Instruction(getGEPReturnType(Ptr, IdxList), GetElementPtr,
- OperandTraits<GetElementPtrInst>::op_end(this) - Values,
- Values, InsertBefore),
+ : Instruction(getGEPReturnType(Ptr, IdxList), GetElementPtr, AllocInfo,
+ InsertBefore),
SourceElementType(PointeeType),
ResultElementType(getIndexedType(PointeeType, IdxList)) {
init(Ptr, IdxList, NameStr);
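
For variadic instructions the marker cannot be a class constant, so Create builds an IntrusiveOperandsAllocMarker from the runtime count and passes it twice: to placement new, which sizes the allocation, and to the constructor, which forwards it as AllocInfo so the base class records a matching operand count. A hedged sketch with a hypothetical variadic node (ExampleVariadic is illustrative, not from this patch):

static ExampleVariadic *Create(ArrayRef<Value *> Args) {
  // Size the marker from the runtime operand count; the GEP case above
  // additionally adds 1 for the base pointer.
  IntrusiveOperandsAllocMarker AllocMarker{unsigned(Args.size())};
  return new (AllocMarker) ExampleVariadic(Args, AllocMarker);
}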
@@ -1403,26 +1411,29 @@ public:
/// hold the calling convention of the call.
///
class CallInst : public CallBase {
- CallInst(const CallInst &CI);
+ CallInst(const CallInst &CI, AllocInfo AllocInfo);
/// Construct a CallInst from a range of arguments
inline CallInst(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args,
ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr,
- InsertPosition InsertBefore);
+ AllocInfo AllocInfo, InsertPosition InsertBefore);
inline CallInst(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args,
- const Twine &NameStr, InsertPosition InsertBefore)
- : CallInst(Ty, Func, Args, std::nullopt, NameStr, InsertBefore) {}
+ const Twine &NameStr, AllocInfo AllocInfo,
+ InsertPosition InsertBefore)
+ : CallInst(Ty, Func, Args, std::nullopt, NameStr, AllocInfo,
+ InsertBefore) {}
explicit CallInst(FunctionType *Ty, Value *F, const Twine &NameStr,
- InsertPosition InsertBefore);
+ AllocInfo AllocInfo, InsertPosition InsertBefore);
void init(FunctionType *FTy, Value *Func, ArrayRef<Value *> Args,
ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr);
void init(FunctionType *FTy, Value *Func, const Twine &NameStr);
/// Compute the number of operands to allocate.
- static int ComputeNumOperands(int NumArgs, int NumBundleInputs = 0) {
+ static unsigned ComputeNumOperands(unsigned NumArgs,
+ unsigned NumBundleInputs = 0) {
// We need one operand for the called function, plus the input operand
// counts provided.
return 1 + NumArgs + NumBundleInputs;
@@ -1437,26 +1448,29 @@ protected:
public:
static CallInst *Create(FunctionType *Ty, Value *F, const Twine &NameStr = "",
InsertPosition InsertBefore = nullptr) {
- return new (ComputeNumOperands(0)) CallInst(Ty, F, NameStr, InsertBefore);
+ IntrusiveOperandsAllocMarker AllocMarker{ComputeNumOperands(0)};
+ return new (AllocMarker)
+ CallInst(Ty, F, NameStr, AllocMarker, InsertBefore);
}
static CallInst *Create(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args,
const Twine &NameStr,
InsertPosition InsertBefore = nullptr) {
- return new (ComputeNumOperands(Args.size()))
- CallInst(Ty, Func, Args, std::nullopt, NameStr, InsertBefore);
+ IntrusiveOperandsAllocMarker AllocMarker{ComputeNumOperands(Args.size())};
+ return new (AllocMarker) CallInst(Ty, Func, Args, std::nullopt, NameStr,
+ AllocMarker, InsertBefore);
}
static CallInst *Create(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args,
ArrayRef<OperandBundleDef> Bundles = std::nullopt,
const Twine &NameStr = "",
InsertPosition InsertBefore = nullptr) {
- const int NumOperands =
- ComputeNumOperands(Args.size(), CountBundleInputs(Bundles));
- const unsigned DescriptorBytes = Bundles.size() * sizeof(BundleOpInfo);
+ IntrusiveOperandsAndDescriptorAllocMarker AllocMarker{
+ ComputeNumOperands(unsigned(Args.size()), CountBundleInputs(Bundles)),
+ unsigned(Bundles.size() * sizeof(BundleOpInfo))};
- return new (NumOperands, DescriptorBytes)
- CallInst(Ty, Func, Args, Bundles, NameStr, InsertBefore);
+ return new (AllocMarker)
+ CallInst(Ty, Func, Args, Bundles, NameStr, AllocMarker, InsertBefore);
}
static CallInst *Create(FunctionCallee Func, const Twine &NameStr = "",
@@ -1561,12 +1575,11 @@ private:
CallInst::CallInst(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args,
ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr,
- InsertPosition InsertBefore)
- : CallBase(Ty->getReturnType(), Instruction::Call,
- OperandTraits<CallBase>::op_end(this) -
- (Args.size() + CountBundleInputs(Bundles) + 1),
- unsigned(Args.size() + CountBundleInputs(Bundles) + 1),
+ AllocInfo AllocInfo, InsertPosition InsertBefore)
+ : CallBase(Ty->getReturnType(), Instruction::Call, AllocInfo,
InsertBefore) {
+ assert(AllocInfo.NumOps ==
+ unsigned(Args.size() + CountBundleInputs(Bundles) + 1));
init(Ty, Func, Args, Bundles, NameStr);
}
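
Calls with operand bundles use the third marker kind: IntrusiveOperandsAndDescriptorAllocMarker pairs the operand count with the descriptor size in bytes, replacing the old operator new(Size, NumOps, DescBytes) overload, and the assert added to the CallInst constructor cross-checks the marker against the operands actually initialized. The descriptor flag falls out of the byte count, as the AllocInfo conversion in the User.h hunk later in this patch shows:

// From User.h below: a zero-byte descriptor degrades to a plain
// intrusive-operands allocation.
constexpr AllocInfo(const IntrusiveOperandsAndDescriptorAllocMarker Alloc)
    : NumOps(Alloc.NumOps), HasHungOffUses(false),
      HasDescriptor(Alloc.DescBytes != 0) {}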
@@ -1577,10 +1590,11 @@ CallInst::CallInst(FunctionType *Ty, Value *Func, ArrayRef<Value *> Args,
/// This class represents the LLVM 'select' instruction.
///
class SelectInst : public Instruction {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{3};
SelectInst(Value *C, Value *S1, Value *S2, const Twine &NameStr,
InsertPosition InsertBefore)
- : Instruction(S1->getType(), Instruction::Select, &Op<0>(), 3,
+ : Instruction(S1->getType(), Instruction::Select, AllocMarker,
InsertBefore) {
init(C, S1, S2);
setName(NameStr);
@@ -1604,7 +1618,8 @@ public:
const Twine &NameStr = "",
InsertPosition InsertBefore = nullptr,
Instruction *MDFrom = nullptr) {
- SelectInst *Sel = new(3) SelectInst(C, S1, S2, NameStr, InsertBefore);
+ SelectInst *Sel =
+ new (AllocMarker) SelectInst(C, S1, S2, NameStr, InsertBefore);
if (MDFrom)
Sel->copyMetadata(*MDFrom);
return Sel;
@@ -1693,6 +1708,8 @@ public:
/// element from a VectorType value
///
class ExtractElementInst : public Instruction {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{2};
+
ExtractElementInst(Value *Vec, Value *Idx, const Twine &NameStr = "",
InsertPosition InsertBefore = nullptr);
@@ -1706,7 +1723,8 @@ public:
static ExtractElementInst *Create(Value *Vec, Value *Idx,
const Twine &NameStr = "",
InsertPosition InsertBefore = nullptr) {
- return new(2) ExtractElementInst(Vec, Idx, NameStr, InsertBefore);
+ return new (AllocMarker)
+ ExtractElementInst(Vec, Idx, NameStr, InsertBefore);
}
/// Return true if an extractelement instruction can be
@@ -1749,6 +1767,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ExtractElementInst, Value)
/// element into a VectorType value
///
class InsertElementInst : public Instruction {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{3};
+
InsertElementInst(Value *Vec, Value *NewElt, Value *Idx,
const Twine &NameStr = "",
InsertPosition InsertBefore = nullptr);
@@ -1763,7 +1783,8 @@ public:
static InsertElementInst *Create(Value *Vec, Value *NewElt, Value *Idx,
const Twine &NameStr = "",
InsertPosition InsertBefore = nullptr) {
- return new(3) InsertElementInst(Vec, NewElt, Idx, NameStr, InsertBefore);
+ return new (AllocMarker)
+ InsertElementInst(Vec, NewElt, Idx, NameStr, InsertBefore);
}
/// Return true if an insertelement instruction can be
@@ -1813,6 +1834,8 @@ constexpr int PoisonMaskElem = -1;
/// For scalable vectors, all the elements of the mask must be 0 or -1. This
/// requirement may be relaxed in the future.
class ShuffleVectorInst : public Instruction {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{2};
+
SmallVector<int, 4> ShuffleMask;
Constant *ShuffleMaskForBitcode;
@@ -1834,7 +1857,7 @@ public:
const Twine &NameStr = "",
InsertPosition InsertBefore = nullptr);
- void *operator new(size_t S) { return User::operator new(S, 2); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { return User::operator delete(Ptr); }
/// Swap the operands and adjust the mask to preserve the semantics
@@ -2395,6 +2418,8 @@ ExtractValueInst::ExtractValueInst(Value *Agg, ArrayRef<unsigned> Idxs,
/// value into an aggregate value.
///
class InsertValueInst : public Instruction {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{2};
+
SmallVector<unsigned, 4> Indices;
InsertValueInst(const InsertValueInst &IVI);
@@ -2423,7 +2448,7 @@ protected:
public:
// allocate space for exactly two operands
- void *operator new(size_t S) { return User::operator new(S, 2); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
static InsertValueInst *Create(Value *Agg, Value *Val,
@@ -2493,9 +2518,7 @@ struct OperandTraits<InsertValueInst> :
InsertValueInst::InsertValueInst(Value *Agg, Value *Val,
ArrayRef<unsigned> Idxs, const Twine &NameStr,
InsertPosition InsertBefore)
- : Instruction(Agg->getType(), InsertValue,
- OperandTraits<InsertValueInst>::op_begin(this), 2,
- InsertBefore) {
+ : Instruction(Agg->getType(), InsertValue, AllocMarker, InsertBefore) {
init(Agg, Val, Idxs, NameStr);
}
@@ -2510,6 +2533,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(InsertValueInst, Value)
// scientist's overactive imagination.
//
class PHINode : public Instruction {
+ constexpr static HungOffOperandsAllocMarker AllocMarker{};
+
/// The number of operands actually allocated. NumOperands is
/// the number actually in use.
unsigned ReservedSpace;
@@ -2519,7 +2544,7 @@ class PHINode : public Instruction {
explicit PHINode(Type *Ty, unsigned NumReservedValues,
const Twine &NameStr = "",
InsertPosition InsertBefore = nullptr)
- : Instruction(Ty, Instruction::PHI, nullptr, 0, InsertBefore),
+ : Instruction(Ty, Instruction::PHI, AllocMarker, InsertBefore),
ReservedSpace(NumReservedValues) {
assert(!Ty->isTokenTy() && "PHI nodes cannot have token type!");
setName(NameStr);
@@ -2545,7 +2570,8 @@ public:
static PHINode *Create(Type *Ty, unsigned NumReservedValues,
const Twine &NameStr = "",
InsertPosition InsertBefore = nullptr) {
- return new PHINode(Ty, NumReservedValues, NameStr, InsertBefore);
+ return new (AllocMarker)
+ PHINode(Ty, NumReservedValues, NameStr, InsertBefore);
}
/// Provide fast operand accessors
@@ -2749,6 +2775,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(PHINode, Value)
class LandingPadInst : public Instruction {
using CleanupField = BoolBitfieldElementT<0>;
+ constexpr static HungOffOperandsAllocMarker AllocMarker{};
+
/// The number of operands actually allocated. NumOperands is
/// the number actually in use.
unsigned ReservedSpace;
@@ -2763,7 +2791,7 @@ private:
const Twine &NameStr, InsertPosition InsertBefore);
// Allocate space for exactly zero operands.
- void *operator new(size_t S) { return User::operator new(S); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void growOperands(unsigned Size);
void init(unsigned NumReservedValues, const Twine &NameStr);
@@ -2843,7 +2871,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(LandingPadInst, Value)
/// does not continue in this function any longer.
///
class ReturnInst : public Instruction {
- ReturnInst(const ReturnInst &RI);
+ ReturnInst(const ReturnInst &RI, AllocInfo AllocInfo);
private:
// ReturnInst constructors:
@@ -2859,8 +2887,8 @@ private:
//
// NOTE: If the Value* passed is of type void then the constructor behaves as
// if it was passed NULL.
- explicit ReturnInst(LLVMContext &C, Value *retVal = nullptr,
- InsertPosition InsertBefore = nullptr);
+ explicit ReturnInst(LLVMContext &C, Value *retVal, AllocInfo AllocInfo,
+ InsertPosition InsertBefore);
protected:
// Note: Instruction needs to be a friend here to call cloneImpl.
@@ -2871,11 +2899,13 @@ protected:
public:
static ReturnInst *Create(LLVMContext &C, Value *retVal = nullptr,
InsertPosition InsertBefore = nullptr) {
- return new(!!retVal) ReturnInst(C, retVal, InsertBefore);
+ IntrusiveOperandsAllocMarker AllocMarker{retVal ? 1U : 0U};
+ return new (AllocMarker) ReturnInst(C, retVal, AllocMarker, InsertBefore);
}
static ReturnInst *Create(LLVMContext &C, BasicBlock *InsertAtEnd) {
- return new (0) ReturnInst(C, nullptr, InsertAtEnd);
+ IntrusiveOperandsAllocMarker AllocMarker{0};
+ return new (AllocMarker) ReturnInst(C, nullptr, AllocMarker, InsertAtEnd);
}
/// Provide fast operand accessors
@@ -2923,7 +2953,7 @@ class BranchInst : public Instruction {
/// [Cond, FalseDest,] TrueDest. This makes some accessors faster because
/// they don't have to check for cond/uncond branchness. These are mostly
/// accessed relative from op_end().
- BranchInst(const BranchInst &BI);
+ BranchInst(const BranchInst &BI, AllocInfo AllocInfo);
// BranchInst constructors (where {B, T, F} are blocks, and C is a condition):
// BranchInst(BB *B) - 'br B'
// BranchInst(BB* T, BB *F, Value *C) - 'br C, T, F'
@@ -2933,10 +2963,10 @@ class BranchInst : public Instruction {
// BranchInst(BB* T, BB *F, Value *C, Inst *I) - 'br C, T, F', insert before I
// BranchInst(BB* B, BB *I) - 'br B' insert at end
// BranchInst(BB* T, BB *F, Value *C, BB *I) - 'br C, T, F', insert at end
- explicit BranchInst(BasicBlock *IfTrue,
- InsertPosition InsertBefore = nullptr);
+ explicit BranchInst(BasicBlock *IfTrue, AllocInfo AllocInfo,
+ InsertPosition InsertBefore);
BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
- InsertPosition InsertBefore = nullptr);
+ AllocInfo AllocInfo, InsertPosition InsertBefore);
void AssertOK();
@@ -2976,13 +3006,16 @@ public:
static BranchInst *Create(BasicBlock *IfTrue,
InsertPosition InsertBefore = nullptr) {
- return new(1) BranchInst(IfTrue, InsertBefore);
+ IntrusiveOperandsAllocMarker AllocMarker{1};
+ return new (AllocMarker) BranchInst(IfTrue, AllocMarker, InsertBefore);
}
static BranchInst *Create(BasicBlock *IfTrue, BasicBlock *IfFalse,
Value *Cond,
InsertPosition InsertBefore = nullptr) {
- return new(3) BranchInst(IfTrue, IfFalse, Cond, InsertBefore);
+ IntrusiveOperandsAllocMarker AllocMarker{3};
+ return new (AllocMarker)
+ BranchInst(IfTrue, IfFalse, Cond, AllocMarker, InsertBefore);
}
/// Transparently provide more efficient getOperand methods.
@@ -3054,6 +3087,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(BranchInst, Value)
/// Multiway switch
///
class SwitchInst : public Instruction {
+ constexpr static HungOffOperandsAllocMarker AllocMarker{};
+
unsigned ReservedSpace;
// Operand[0] = Value to switch on
@@ -3070,7 +3105,7 @@ class SwitchInst : public Instruction {
InsertPosition InsertBefore);
// allocate space for exactly zero operands
- void *operator new(size_t S) { return User::operator new(S); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void init(Value *Value, BasicBlock *Default, unsigned NumReserved);
void growOperands();
@@ -3442,6 +3477,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(SwitchInst, Value)
/// Indirect Branch Instruction.
///
class IndirectBrInst : public Instruction {
+ constexpr static HungOffOperandsAllocMarker AllocMarker{};
+
unsigned ReservedSpace;
// Operand[0] = Address to jump to
@@ -3456,7 +3493,7 @@ class IndirectBrInst : public Instruction {
InsertPosition InsertBefore);
// allocate space for exactly zero operands
- void *operator new(size_t S) { return User::operator new(S); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void init(Value *Address, unsigned NumDests);
void growOperands();
@@ -3576,14 +3613,14 @@ class InvokeInst : public CallBase {
/// The index from the end of the operand array to the unwind destination.
static constexpr int UnwindDestOpEndIdx = -2;
- InvokeInst(const InvokeInst &BI);
+ InvokeInst(const InvokeInst &BI, AllocInfo AllocInfo);
/// Construct an InvokeInst given a range of arguments.
///
/// Construct an InvokeInst from a range of arguments
inline InvokeInst(FunctionType *Ty, Value *Func, BasicBlock *IfNormal,
BasicBlock *IfException, ArrayRef<Value *> Args,
- ArrayRef<OperandBundleDef> Bundles, int NumOperands,
+ ArrayRef<OperandBundleDef> Bundles, AllocInfo AllocInfo,
const Twine &NameStr, InsertPosition InsertBefore);
void init(FunctionType *Ty, Value *Func, BasicBlock *IfNormal,
@@ -3591,10 +3628,11 @@ class InvokeInst : public CallBase {
ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr);
/// Compute the number of operands to allocate.
- static int ComputeNumOperands(int NumArgs, int NumBundleInputs = 0) {
+ static unsigned ComputeNumOperands(unsigned NumArgs,
+ size_t NumBundleInputs = 0) {
// We need one operand for the called function, plus our extra operands and
// the input operand counts provided.
- return 1 + NumExtraOperands + NumArgs + NumBundleInputs;
+ return 1 + NumExtraOperands + NumArgs + unsigned(NumBundleInputs);
}
protected:
@@ -3608,10 +3646,11 @@ public:
BasicBlock *IfException, ArrayRef<Value *> Args,
const Twine &NameStr,
InsertPosition InsertBefore = nullptr) {
- int NumOperands = ComputeNumOperands(Args.size());
- return new (NumOperands)
+ IntrusiveOperandsAllocMarker AllocMarker{
+ ComputeNumOperands(unsigned(Args.size()))};
+ return new (AllocMarker)
InvokeInst(Ty, Func, IfNormal, IfException, Args, std::nullopt,
- NumOperands, NameStr, InsertBefore);
+ AllocMarker, NameStr, InsertBefore);
}
static InvokeInst *Create(FunctionType *Ty, Value *Func, BasicBlock *IfNormal,
@@ -3619,12 +3658,12 @@ public:
ArrayRef<OperandBundleDef> Bundles = std::nullopt,
const Twine &NameStr = "",
InsertPosition InsertBefore = nullptr) {
- int NumOperands =
- ComputeNumOperands(Args.size(), CountBundleInputs(Bundles));
- unsigned DescriptorBytes = Bundles.size() * sizeof(BundleOpInfo);
+ IntrusiveOperandsAndDescriptorAllocMarker AllocMarker{
+ ComputeNumOperands(Args.size(), CountBundleInputs(Bundles)),
+ unsigned(Bundles.size() * sizeof(BundleOpInfo))};
- return new (NumOperands, DescriptorBytes)
- InvokeInst(Ty, Func, IfNormal, IfException, Args, Bundles, NumOperands,
+ return new (AllocMarker)
+ InvokeInst(Ty, Func, IfNormal, IfException, Args, Bundles, AllocMarker,
NameStr, InsertBefore);
}
@@ -3709,10 +3748,9 @@ private:
InvokeInst::InvokeInst(FunctionType *Ty, Value *Func, BasicBlock *IfNormal,
BasicBlock *IfException, ArrayRef<Value *> Args,
- ArrayRef<OperandBundleDef> Bundles, int NumOperands,
+ ArrayRef<OperandBundleDef> Bundles, AllocInfo AllocInfo,
const Twine &NameStr, InsertPosition InsertBefore)
- : CallBase(Ty->getReturnType(), Instruction::Invoke,
- OperandTraits<CallBase>::op_end(this) - NumOperands, NumOperands,
+ : CallBase(Ty->getReturnType(), Instruction::Invoke, AllocInfo,
InsertBefore) {
init(Ty, Func, IfNormal, IfException, Args, Bundles, NameStr);
}
@@ -3729,7 +3767,7 @@ class CallBrInst : public CallBase {
unsigned NumIndirectDests;
- CallBrInst(const CallBrInst &BI);
+ CallBrInst(const CallBrInst &BI, AllocInfo AllocInfo);
/// Construct a CallBrInst given a range of arguments.
///
@@ -3737,7 +3775,7 @@ class CallBrInst : public CallBase {
inline CallBrInst(FunctionType *Ty, Value *Func, BasicBlock *DefaultDest,
ArrayRef<BasicBlock *> IndirectDests,
ArrayRef<Value *> Args, ArrayRef<OperandBundleDef> Bundles,
- int NumOperands, const Twine &NameStr,
+ AllocInfo AllocInfo, const Twine &NameStr,
InsertPosition InsertBefore);
void init(FunctionType *FTy, Value *Func, BasicBlock *DefaultDest,
@@ -3745,11 +3783,11 @@ class CallBrInst : public CallBase {
ArrayRef<OperandBundleDef> Bundles, const Twine &NameStr);
/// Compute the number of operands to allocate.
- static int ComputeNumOperands(int NumArgs, int NumIndirectDests,
- int NumBundleInputs = 0) {
+ static unsigned ComputeNumOperands(int NumArgs, int NumIndirectDests,
+ int NumBundleInputs = 0) {
// We need one operand for the called function, plus our extra operands and
// the input operand counts provided.
- return 2 + NumIndirectDests + NumArgs + NumBundleInputs;
+ return unsigned(2 + NumIndirectDests + NumArgs + NumBundleInputs);
}
protected:
@@ -3764,10 +3802,11 @@ public:
ArrayRef<BasicBlock *> IndirectDests,
ArrayRef<Value *> Args, const Twine &NameStr,
InsertPosition InsertBefore = nullptr) {
- int NumOperands = ComputeNumOperands(Args.size(), IndirectDests.size());
- return new (NumOperands)
+ IntrusiveOperandsAllocMarker AllocMarker{
+ ComputeNumOperands(Args.size(), IndirectDests.size())};
+ return new (AllocMarker)
CallBrInst(Ty, Func, DefaultDest, IndirectDests, Args, std::nullopt,
- NumOperands, NameStr, InsertBefore);
+ AllocMarker, NameStr, InsertBefore);
}
static CallBrInst *
@@ -3775,13 +3814,14 @@ public:
ArrayRef<BasicBlock *> IndirectDests, ArrayRef<Value *> Args,
ArrayRef<OperandBundleDef> Bundles = std::nullopt,
const Twine &NameStr = "", InsertPosition InsertBefore = nullptr) {
- int NumOperands = ComputeNumOperands(Args.size(), IndirectDests.size(),
- CountBundleInputs(Bundles));
- unsigned DescriptorBytes = Bundles.size() * sizeof(BundleOpInfo);
+ IntrusiveOperandsAndDescriptorAllocMarker AllocMarker{
+ ComputeNumOperands(Args.size(), IndirectDests.size(),
+ CountBundleInputs(Bundles)),
+ unsigned(Bundles.size() * sizeof(BundleOpInfo))};
- return new (NumOperands, DescriptorBytes)
+ return new (AllocMarker)
CallBrInst(Ty, Func, DefaultDest, IndirectDests, Args, Bundles,
- NumOperands, NameStr, InsertBefore);
+ AllocMarker, NameStr, InsertBefore);
}
static CallBrInst *Create(FunctionCallee Func, BasicBlock *DefaultDest,
@@ -3881,10 +3921,9 @@ private:
CallBrInst::CallBrInst(FunctionType *Ty, Value *Func, BasicBlock *DefaultDest,
ArrayRef<BasicBlock *> IndirectDests,
ArrayRef<Value *> Args,
- ArrayRef<OperandBundleDef> Bundles, int NumOperands,
+ ArrayRef<OperandBundleDef> Bundles, AllocInfo AllocInfo,
const Twine &NameStr, InsertPosition InsertBefore)
- : CallBase(Ty->getReturnType(), Instruction::CallBr,
- OperandTraits<CallBase>::op_end(this) - NumOperands, NumOperands,
+ : CallBase(Ty->getReturnType(), Instruction::CallBr, AllocInfo,
InsertBefore) {
init(Ty, Func, DefaultDest, IndirectDests, Args, Bundles, NameStr);
}
@@ -3897,6 +3936,8 @@ CallBrInst::CallBrInst(FunctionType *Ty, Value *Func, BasicBlock *DefaultDest,
/// Resume the propagation of an exception.
///
class ResumeInst : public Instruction {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{1};
+
ResumeInst(const ResumeInst &RI);
explicit ResumeInst(Value *Exn, InsertPosition InsertBefore = nullptr);
@@ -3909,7 +3950,7 @@ protected:
public:
static ResumeInst *Create(Value *Exn, InsertPosition InsertBefore = nullptr) {
- return new(1) ResumeInst(Exn, InsertBefore);
+ return new (AllocMarker) ResumeInst(Exn, InsertBefore);
}
/// Provide fast operand accessors
@@ -3951,6 +3992,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ResumeInst, Value)
class CatchSwitchInst : public Instruction {
using UnwindDestField = BoolBitfieldElementT<0>;
+ constexpr static HungOffOperandsAllocMarker AllocMarker{};
+
/// The number of operands actually allocated. NumOperands is
/// the number actually in use.
unsigned ReservedSpace;
@@ -3969,7 +4012,7 @@ class CatchSwitchInst : public Instruction {
InsertPosition InsertBefore);
// allocate space for exactly zero operands
- void *operator new(size_t S) { return User::operator new(S); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void init(Value *ParentPad, BasicBlock *UnwindDest, unsigned NumReserved);
void growOperands(unsigned Size);
@@ -4114,9 +4157,9 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CatchSwitchInst, Value)
class CleanupPadInst : public FuncletPadInst {
private:
explicit CleanupPadInst(Value *ParentPad, ArrayRef<Value *> Args,
- unsigned Values, const Twine &NameStr,
+ AllocInfo AllocInfo, const Twine &NameStr,
InsertPosition InsertBefore)
- : FuncletPadInst(Instruction::CleanupPad, ParentPad, Args, Values,
+ : FuncletPadInst(Instruction::CleanupPad, ParentPad, Args, AllocInfo,
NameStr, InsertBefore) {}
public:
@@ -4124,9 +4167,9 @@ public:
ArrayRef<Value *> Args = std::nullopt,
const Twine &NameStr = "",
InsertPosition InsertBefore = nullptr) {
- unsigned Values = 1 + Args.size();
- return new (Values)
- CleanupPadInst(ParentPad, Args, Values, NameStr, InsertBefore);
+ IntrusiveOperandsAllocMarker AllocMarker{unsigned(1 + Args.size())};
+ return new (AllocMarker)
+ CleanupPadInst(ParentPad, Args, AllocMarker, NameStr, InsertBefore);
}
/// Methods for support type inquiry through isa, cast, and dyn_cast:
@@ -4144,18 +4187,18 @@ public:
class CatchPadInst : public FuncletPadInst {
private:
explicit CatchPadInst(Value *CatchSwitch, ArrayRef<Value *> Args,
- unsigned Values, const Twine &NameStr,
+ AllocInfo AllocInfo, const Twine &NameStr,
InsertPosition InsertBefore)
- : FuncletPadInst(Instruction::CatchPad, CatchSwitch, Args, Values,
+ : FuncletPadInst(Instruction::CatchPad, CatchSwitch, Args, AllocInfo,
NameStr, InsertBefore) {}
public:
static CatchPadInst *Create(Value *CatchSwitch, ArrayRef<Value *> Args,
const Twine &NameStr = "",
InsertPosition InsertBefore = nullptr) {
- unsigned Values = 1 + Args.size();
- return new (Values)
- CatchPadInst(CatchSwitch, Args, Values, NameStr, InsertBefore);
+ IntrusiveOperandsAllocMarker AllocMarker{unsigned(1 + Args.size())};
+ return new (AllocMarker)
+ CatchPadInst(CatchSwitch, Args, AllocMarker, NameStr, InsertBefore);
}
/// Convenience accessors
@@ -4181,6 +4224,8 @@ public:
//===----------------------------------------------------------------------===//
class CatchReturnInst : public Instruction {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{2};
+
CatchReturnInst(const CatchReturnInst &RI);
CatchReturnInst(Value *CatchPad, BasicBlock *BB, InsertPosition InsertBefore);
@@ -4197,7 +4242,7 @@ public:
InsertPosition InsertBefore = nullptr) {
assert(CatchPad);
assert(BB);
- return new (2) CatchReturnInst(CatchPad, BB, InsertBefore);
+ return new (AllocMarker) CatchReturnInst(CatchPad, BB, InsertBefore);
}
/// Provide fast operand accessors
@@ -4257,9 +4302,9 @@ class CleanupReturnInst : public Instruction {
using UnwindDestField = BoolBitfieldElementT<0>;
private:
- CleanupReturnInst(const CleanupReturnInst &RI);
- CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB, unsigned Values,
- InsertPosition InsertBefore = nullptr);
+ CleanupReturnInst(const CleanupReturnInst &RI, AllocInfo AllocInfo);
+ CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB,
+ AllocInfo AllocInfo, InsertPosition InsertBefore = nullptr);
void init(Value *CleanupPad, BasicBlock *UnwindBB);
@@ -4277,8 +4322,9 @@ public:
unsigned Values = 1;
if (UnwindBB)
++Values;
- return new (Values)
- CleanupReturnInst(CleanupPad, UnwindBB, Values, InsertBefore);
+ IntrusiveOperandsAllocMarker AllocMarker{Values};
+ return new (AllocMarker)
+ CleanupReturnInst(CleanupPad, UnwindBB, AllocMarker, InsertBefore);
}
/// Provide fast operand accessors
@@ -4350,6 +4396,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CleanupReturnInst, Value)
/// end of the block cannot be reached.
///
class UnreachableInst : public Instruction {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{0};
+
protected:
// Note: Instruction needs to be a friend here to call cloneImpl.
friend class Instruction;
@@ -4361,7 +4409,7 @@ public:
InsertPosition InsertBefore = nullptr);
// allocate space for exactly zero operands
- void *operator new(size_t S) { return User::operator new(S, 0); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
unsigned getNumSuccessors() const { return 0; }
diff --git a/llvm/include/llvm/IR/User.h b/llvm/include/llvm/IR/User.h
index 910815f..39e1314 100644
--- a/llvm/include/llvm/IR/User.h
+++ b/llvm/include/llvm/IR/User.h
@@ -43,39 +43,86 @@ struct OperandTraits;
class User : public Value {
friend struct HungoffOperandTraits;
+ template <class ConstantClass> friend struct ConstantAggrKeyType;
LLVM_ATTRIBUTE_ALWAYS_INLINE static void *
allocateFixedOperandUser(size_t, unsigned, unsigned);
protected:
+ // Disable the default operator new, as all subclasses must use one of the
+ // custom operators below depending on how they store their operands.
+ void *operator new(size_t Size) = delete;
+
+ /// Indicates this User has operands "hung off" in another allocation.
+ struct HungOffOperandsAllocMarker {};
+
+ /// Indicates this User has operands co-allocated.
+ struct IntrusiveOperandsAllocMarker {
+ /// The number of operands for this User.
+ const unsigned NumOps;
+ };
+
+ /// Indicates this User has operands and a descriptor co-allocated.
+ struct IntrusiveOperandsAndDescriptorAllocMarker {
+ /// The number of operands for this User.
+ const unsigned NumOps;
+ /// The number of bytes to allocate for the descriptor. Must be divisible by
+ /// `sizeof(void *)`.
+ const unsigned DescBytes;
+ };
+
+ /// Information about how a User object was allocated, to be passed into the
+ /// User constructor.
+ ///
+ /// DO NOT USE DIRECTLY. Use one of the `AllocMarker` structs instead; they
+ /// can all be implicitly converted to `AllocInfo`.
+ struct AllocInfo {
+ public:
+ const unsigned NumOps : NumUserOperandsBits;
+ const bool HasHungOffUses : 1;
+ const bool HasDescriptor : 1;
+
+ AllocInfo() = delete;
+
+ constexpr AllocInfo(const HungOffOperandsAllocMarker)
+ : NumOps(0), HasHungOffUses(true), HasDescriptor(false) {}
+
+ constexpr AllocInfo(const IntrusiveOperandsAllocMarker Alloc)
+ : NumOps(Alloc.NumOps), HasHungOffUses(false), HasDescriptor(false) {}
+
+ constexpr AllocInfo(const IntrusiveOperandsAndDescriptorAllocMarker Alloc)
+ : NumOps(Alloc.NumOps), HasHungOffUses(false),
+ HasDescriptor(Alloc.DescBytes != 0) {}
+ };
+
/// Allocate a User with an operand pointer co-allocated.
///
/// This is used for subclasses which need to allocate a variable number
/// of operands, i.e., 'hung off uses'.
- void *operator new(size_t Size);
+ void *operator new(size_t Size, HungOffOperandsAllocMarker);
/// Allocate a User with the operands co-allocated.
///
/// This is used for subclasses which have a fixed number of operands.
- void *operator new(size_t Size, unsigned Us);
+ void *operator new(size_t Size, IntrusiveOperandsAllocMarker allocTrait);
/// Allocate a User with the operands co-allocated. If DescBytes is non-zero
/// then allocate an additional DescBytes bytes before the operands. These
/// bytes can be accessed by calling getDescriptor.
- ///
- /// DescBytes needs to be divisible by sizeof(void *). The allocated
- /// descriptor, if any, is aligned to sizeof(void *) bytes.
- ///
- /// This is used for subclasses which have a fixed number of operands.
- void *operator new(size_t Size, unsigned Us, unsigned DescBytes);
-
- User(Type *ty, unsigned vty, Use *, unsigned NumOps)
- : Value(ty, vty) {
- assert(NumOps < (1u << NumUserOperandsBits) && "Too many operands");
- NumUserOperands = NumOps;
+ void *operator new(size_t Size,
+ IntrusiveOperandsAndDescriptorAllocMarker allocTrait);
+
+ User(Type *ty, unsigned vty, AllocInfo AllocInfo) : Value(ty, vty) {
+ assert(AllocInfo.NumOps < (1u << NumUserOperandsBits) &&
+ "Too many operands");
+ NumUserOperands = AllocInfo.NumOps;
+ assert((!AllocInfo.HasDescriptor || !AllocInfo.HasHungOffUses) &&
+ "Cannot have both hung off uses and a descriptor");
+ HasHungOffUses = AllocInfo.HasHungOffUses;
+ HasDescriptor = AllocInfo.HasDescriptor;
// If we have hung off uses, then the operand list should initially be
// null.
- assert((!HasHungOffUses || !getOperandList()) &&
+ assert((!AllocInfo.HasHungOffUses || !getOperandList()) &&
"Error in initializing hung off uses for User");
}
@@ -98,7 +145,20 @@ public:
/// Free memory allocated for User and Use objects.
void operator delete(void *Usr);
/// Placement delete - required by std, called if the ctor throws.
- void operator delete(void *Usr, unsigned) {
+ void operator delete(void *Usr, HungOffOperandsAllocMarker) {
+ // Note: If a subclass manipulates the information which is required to
+ // calculate the Usr memory pointer, e.g. NumUserOperands, the operator
+ // delete of that subclass has to restore the changed information to the
+ // original value, since the dtor of that class is not called if the ctor
+ // fails.
+ User::operator delete(Usr);
+
+#ifndef LLVM_ENABLE_EXCEPTIONS
+ llvm_unreachable("Constructor throws?");
+#endif
+ }
+ /// Placement delete - required by std, called if the ctor throws.
+ void operator delete(void *Usr, IntrusiveOperandsAllocMarker) {
// Note: If a subclass manipulates the information which is required to calculate the
// Usr memory pointer, e.g. NumUserOperands, the operator delete of that subclass has
// to restore the changed information to the original value, since the dtor of that class
@@ -110,7 +170,7 @@ public:
#endif
}
/// Placement delete - required by std, called if the ctor throws.
- void operator delete(void *Usr, unsigned, unsigned) {
+ void operator delete(void *Usr, IntrusiveOperandsAndDescriptorAllocMarker) {
// Note: If a subclass manipulates the information which is required to calculate the
// Usr memory pointer, e.g. NumUserOperands, the operator delete of that subclass has
// to restore the changed information to the original value, since the dtor of that class
@@ -195,19 +255,6 @@ public:
/// Returns the descriptor co-allocated with this User instance.
MutableArrayRef<uint8_t> getDescriptor();
- /// Set the number of operands on a GlobalVariable.
- ///
- /// GlobalVariable always allocates space for a single operands, but
- /// doesn't always use it.
- ///
- /// FIXME: As that the number of operands is used to find the start of
- /// the allocated memory in operator delete, we need to always think we have
- /// 1 operand before delete.
- void setGlobalVariableNumOperands(unsigned NumOps) {
- assert(NumOps <= 1 && "GlobalVariable can only have 0 or 1 operands");
- NumUserOperands = NumOps;
- }
-
/// Subclasses with hung off uses need to manage the operand count
/// themselves. In these instances, the operand count isn't used to find the
/// OperandList, so there's no issue in having the operand count change.
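
The User.h hunk is the core of the change: the three marker types classify every allocation strategy a User subclass can have, and AllocInfo is the common currency their constructors accept. A compact summary of which marker fits which subclass, based on the uses in this patch:

// HungOffOperandsAllocMarker{}        - operands in a separate, growable
//                                       allocation (PHINode, SwitchInst).
// IntrusiveOperandsAllocMarker{N}     - exactly N operands co-allocated
//                                       with the object (StoreInst uses 2).
// IntrusiveOperandsAndDescriptorAllocMarker{N, Bytes}
//                                     - N operands plus a Bytes-sized
//                                       descriptor (CallBase with bundles).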
diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h
index 2fdbbbd..8888468 100644
--- a/llvm/include/llvm/SandboxIR/SandboxIR.h
+++ b/llvm/include/llvm/SandboxIR/SandboxIR.h
@@ -1506,6 +1506,10 @@ public:
static Value *create(Value *Cond, Value *True, Value *False,
BasicBlock *InsertAtEnd, Context &Ctx,
const Twine &Name = "");
+
+ const Value *getCondition() const { return getOperand(0); }
+ const Value *getTrueValue() const { return getOperand(1); }
+ const Value *getFalseValue() const { return getOperand(2); }
Value *getCondition() { return getOperand(0); }
Value *getTrueValue() { return getOperand(1); }
Value *getFalseValue() { return getOperand(2); }
@@ -1513,7 +1517,16 @@ public:
void setCondition(Value *New) { setOperand(0, New); }
void setTrueValue(Value *New) { setOperand(1, New); }
void setFalseValue(Value *New) { setOperand(2, New); }
- void swapValues() { cast<llvm::SelectInst>(Val)->swapValues(); }
+ void swapValues();
+
+ /// Return a string if the specified operands are invalid for a select
+ /// operation, otherwise return null.
+ static const char *areInvalidOperands(Value *Cond, Value *True,
+ Value *False) {
+ return llvm::SelectInst::areInvalidOperands(Cond->Val, True->Val,
+ False->Val);
+ }
+
/// For isa/dyn_cast.
static bool classof(const Value *From);
};
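
The SandboxIR additions mirror the llvm::SelectInst API: areInvalidOperands returns an error string for invalid operand combinations and null otherwise, so callers can validate before constructing. A hedged usage sketch (Cond, TrueV, FalseV, InsertAtEnd, and Ctx are assumed to be in scope):

if (const char *Err = SelectInst::areInvalidOperands(Cond, TrueV, FalseV))
  report_fatal_error(Twine("cannot create select: ") + Err);
Value *Sel = SelectInst::create(Cond, TrueV, FalseV, InsertAtEnd, Ctx, "sel");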
diff --git a/llvm/include/llvm/Support/Casting.h b/llvm/include/llvm/Support/Casting.h
index 14a32cc..66fdcb4 100644
--- a/llvm/include/llvm/Support/Casting.h
+++ b/llvm/include/llvm/Support/Casting.h
@@ -755,7 +755,7 @@ template <class X, class Y> auto dyn_cast_if_present(Y *Val) {
// Forwards to dyn_cast_if_present to avoid breaking current users. This is
// deprecated and will be removed in a future patch, use
-// cast_if_present instead.
+// dyn_cast_if_present instead.
template <class X, class Y> auto dyn_cast_or_null(const Y &Val) {
return dyn_cast_if_present<X>(Val);
}
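
The Casting.h hunk only corrects the comment: dyn_cast_or_null forwards to dyn_cast_if_present, so the deprecation note should name the latter. A hedged usage sketch (MaybeNull is an assumed, possibly-null Value pointer):

// Both spellings accept null and return null on a failed cast; new code
// should prefer dyn_cast_if_present.
if (auto *F = dyn_cast_if_present<Function>(MaybeNull))
  F->print(llvm::errs());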
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp
index e32a54f..6d035d5 100644
--- a/llvm/lib/IR/Constants.cpp
+++ b/llvm/lib/IR/Constants.cpp
@@ -1267,9 +1267,9 @@ static Constant *getSequenceIfElementsMatch(Constant *C,
}
ConstantAggregate::ConstantAggregate(Type *T, ValueTy VT,
- ArrayRef<Constant *> V)
- : Constant(T, VT, OperandTraits<ConstantAggregate>::op_end(this) - V.size(),
- V.size()) {
+ ArrayRef<Constant *> V,
+ AllocInfo AllocInfo)
+ : Constant(T, VT, AllocInfo) {
llvm::copy(V, op_begin());
// Check that types match, unless this is an opaque struct.
@@ -1282,8 +1282,9 @@ ConstantAggregate::ConstantAggregate(Type *T, ValueTy VT,
}
}
-ConstantArray::ConstantArray(ArrayType *T, ArrayRef<Constant *> V)
- : ConstantAggregate(T, ConstantArrayVal, V) {
+ConstantArray::ConstantArray(ArrayType *T, ArrayRef<Constant *> V,
+ AllocInfo AllocInfo)
+ : ConstantAggregate(T, ConstantArrayVal, V, AllocInfo) {
assert(V.size() == T->getNumElements() &&
"Invalid initializer for constant array");
}
@@ -1346,8 +1347,9 @@ StructType *ConstantStruct::getTypeForElements(ArrayRef<Constant*> V,
return getTypeForElements(V[0]->getContext(), V, Packed);
}
-ConstantStruct::ConstantStruct(StructType *T, ArrayRef<Constant *> V)
- : ConstantAggregate(T, ConstantStructVal, V) {
+ConstantStruct::ConstantStruct(StructType *T, ArrayRef<Constant *> V,
+ AllocInfo AllocInfo)
+ : ConstantAggregate(T, ConstantStructVal, V, AllocInfo) {
assert((T->isOpaque() || V.size() == T->getNumElements()) &&
"Invalid initializer for constant struct");
}
@@ -1388,8 +1390,9 @@ Constant *ConstantStruct::get(StructType *ST, ArrayRef<Constant*> V) {
return ST->getContext().pImpl->StructConstants.getOrCreate(ST, V);
}
-ConstantVector::ConstantVector(VectorType *T, ArrayRef<Constant *> V)
- : ConstantAggregate(T, ConstantVectorVal, V) {
+ConstantVector::ConstantVector(VectorType *T, ArrayRef<Constant *> V,
+ AllocInfo AllocInfo)
+ : ConstantAggregate(T, ConstantVectorVal, V, AllocInfo) {
assert(V.size() == cast<FixedVectorType>(T)->getNumElements() &&
"Invalid initializer for constant vector");
}
@@ -1879,7 +1882,7 @@ BlockAddress *BlockAddress::get(Function *F, BasicBlock *BB) {
BlockAddress::BlockAddress(Function *F, BasicBlock *BB)
: Constant(PointerType::get(F->getContext(), F->getAddressSpace()),
- Value::BlockAddressVal, &Op<0>(), 2) {
+ Value::BlockAddressVal, AllocMarker) {
setOperand(0, F);
setOperand(1, BB);
BB->AdjustBlockAddressRefCount(1);
@@ -1951,7 +1954,7 @@ DSOLocalEquivalent *DSOLocalEquivalent::get(GlobalValue *GV) {
}
DSOLocalEquivalent::DSOLocalEquivalent(GlobalValue *GV)
- : Constant(GV->getType(), Value::DSOLocalEquivalentVal, &Op<0>(), 1) {
+ : Constant(GV->getType(), Value::DSOLocalEquivalentVal, AllocMarker) {
setOperand(0, GV);
}
@@ -2009,7 +2012,7 @@ NoCFIValue *NoCFIValue::get(GlobalValue *GV) {
}
NoCFIValue::NoCFIValue(GlobalValue *GV)
- : Constant(GV->getType(), Value::NoCFIValueVal, &Op<0>(), 1) {
+ : Constant(GV->getType(), Value::NoCFIValueVal, AllocMarker) {
setOperand(0, GV);
}
@@ -2056,7 +2059,7 @@ ConstantPtrAuth *ConstantPtrAuth::getWithSameSchema(Constant *Pointer) const {
ConstantPtrAuth::ConstantPtrAuth(Constant *Ptr, ConstantInt *Key,
ConstantInt *Disc, Constant *AddrDisc)
- : Constant(Ptr->getType(), Value::ConstantPtrAuthVal, &Op<0>(), 4) {
+ : Constant(Ptr->getType(), Value::ConstantPtrAuthVal, AllocMarker) {
assert(Ptr->getType()->isPointerTy());
assert(Key->getBitWidth() == 32);
assert(Disc->getBitWidth() == 64);
@@ -2758,11 +2761,8 @@ const char *ConstantExpr::getOpcodeName() const {
GetElementPtrConstantExpr::GetElementPtrConstantExpr(
Type *SrcElementTy, Constant *C, ArrayRef<Constant *> IdxList, Type *DestTy,
- std::optional<ConstantRange> InRange)
- : ConstantExpr(DestTy, Instruction::GetElementPtr,
- OperandTraits<GetElementPtrConstantExpr>::op_end(this) -
- (IdxList.size() + 1),
- IdxList.size() + 1),
+ std::optional<ConstantRange> InRange, AllocInfo AllocInfo)
+ : ConstantExpr(DestTy, Instruction::GetElementPtr, AllocInfo),
SrcElementTy(SrcElementTy),
ResElementTy(GetElementPtrInst::getIndexedType(SrcElementTy, IdxList)),
InRange(std::move(InRange)) {
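The same mechanical substitution repeats across the aggregate constant constructors in this file; in miniature, with Foo standing in for any of them (shapes taken from the hunks above):

    // Before: the constructor derived its Use array from the operand count.
    //   Foo::Foo(Type *Ty, ArrayRef<Constant *> V)
    //       : Constant(Ty, FooVal,
    //                  OperandTraits<Foo>::op_end(this) - V.size(), V.size()) {}
    // After: the allocating caller builds a marker and threads it through.
    //   Foo::Foo(Type *Ty, ArrayRef<Constant *> V, AllocInfo AllocInfo)
    //       : Constant(Ty, FooVal, AllocInfo) {}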
diff --git a/llvm/lib/IR/ConstantsContext.h b/llvm/lib/IR/ConstantsContext.h
index bd19ec6..6afc86f 100644
--- a/llvm/lib/IR/ConstantsContext.h
+++ b/llvm/lib/IR/ConstantsContext.h
@@ -44,14 +44,16 @@ namespace llvm {
/// CastConstantExpr - This class is private to Constants.cpp, and is used
/// behind the scenes to implement cast constant exprs.
class CastConstantExpr final : public ConstantExpr {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{1};
+
public:
CastConstantExpr(unsigned Opcode, Constant *C, Type *Ty)
- : ConstantExpr(Ty, Opcode, &Op<0>(), 1) {
+ : ConstantExpr(Ty, Opcode, AllocMarker) {
Op<0>() = C;
}
// allocate space for exactly one operand
- void *operator new(size_t S) { return User::operator new(S, 1); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
@@ -67,17 +69,19 @@ public:
/// BinaryConstantExpr - This class is private to Constants.cpp, and is used
/// behind the scenes to implement binary constant exprs.
class BinaryConstantExpr final : public ConstantExpr {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{2};
+
public:
BinaryConstantExpr(unsigned Opcode, Constant *C1, Constant *C2,
unsigned Flags)
- : ConstantExpr(C1->getType(), Opcode, &Op<0>(), 2) {
+ : ConstantExpr(C1->getType(), Opcode, AllocMarker) {
Op<0>() = C1;
Op<1>() = C2;
SubclassOptionalData = Flags;
}
// allocate space for exactly two operands
- void *operator new(size_t S) { return User::operator new(S, 2); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
/// Transparently provide more efficient getOperand methods.
@@ -95,16 +99,18 @@ public:
/// Constants.cpp, and is used behind the scenes to implement
/// extractelement constant exprs.
class ExtractElementConstantExpr final : public ConstantExpr {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{2};
+
public:
ExtractElementConstantExpr(Constant *C1, Constant *C2)
- : ConstantExpr(cast<VectorType>(C1->getType())->getElementType(),
- Instruction::ExtractElement, &Op<0>(), 2) {
+ : ConstantExpr(cast<VectorType>(C1->getType())->getElementType(),
+ Instruction::ExtractElement, AllocMarker) {
Op<0>() = C1;
Op<1>() = C2;
}
// allocate space for exactly two operands
- void *operator new(size_t S) { return User::operator new(S, 2); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
/// Transparently provide more efficient getOperand methods.
@@ -122,17 +128,18 @@ public:
/// Constants.cpp, and is used behind the scenes to implement
/// insertelement constant exprs.
class InsertElementConstantExpr final : public ConstantExpr {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{3};
+
public:
InsertElementConstantExpr(Constant *C1, Constant *C2, Constant *C3)
- : ConstantExpr(C1->getType(), Instruction::InsertElement,
- &Op<0>(), 3) {
+ : ConstantExpr(C1->getType(), Instruction::InsertElement, AllocMarker) {
Op<0>() = C1;
Op<1>() = C2;
Op<2>() = C3;
}
// allocate space for exactly three operands
- void *operator new(size_t S) { return User::operator new(S, 3); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { User::operator delete(Ptr); }
/// Transparently provide more efficient getOperand methods.
@@ -150,12 +157,14 @@ public:
/// Constants.cpp, and is used behind the scenes to implement
/// shufflevector constant exprs.
class ShuffleVectorConstantExpr final : public ConstantExpr {
+ constexpr static IntrusiveOperandsAllocMarker AllocMarker{2};
+
public:
ShuffleVectorConstantExpr(Constant *C1, Constant *C2, ArrayRef<int> Mask)
: ConstantExpr(VectorType::get(
cast<VectorType>(C1->getType())->getElementType(),
Mask.size(), isa<ScalableVectorType>(C1->getType())),
- Instruction::ShuffleVector, &Op<0>(), 2) {
+ Instruction::ShuffleVector, AllocMarker) {
assert(ShuffleVectorInst::isValidOperands(C1, C2, Mask) &&
"Invalid shuffle vector instruction operands!");
Op<0>() = C1;
@@ -168,7 +177,7 @@ public:
SmallVector<int, 4> ShuffleMask;
Constant *ShuffleMaskForBitcode;
- void *operator new(size_t S) { return User::operator new(S, 2); }
+ void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
void operator delete(void *Ptr) { return User::operator delete(Ptr); }
/// Transparently provide more efficient getOperand methods.
@@ -191,15 +200,17 @@ class GetElementPtrConstantExpr : public ConstantExpr {
GetElementPtrConstantExpr(Type *SrcElementTy, Constant *C,
ArrayRef<Constant *> IdxList, Type *DestTy,
- std::optional<ConstantRange> InRange);
+ std::optional<ConstantRange> InRange,
+ AllocInfo AllocInfo);
public:
static GetElementPtrConstantExpr *
Create(Type *SrcElementTy, Constant *C, ArrayRef<Constant *> IdxList,
Type *DestTy, unsigned Flags, std::optional<ConstantRange> InRange) {
- GetElementPtrConstantExpr *Result = new (IdxList.size() + 1)
+ IntrusiveOperandsAllocMarker AllocMarker{unsigned(IdxList.size() + 1)};
+ GetElementPtrConstantExpr *Result = new (AllocMarker)
GetElementPtrConstantExpr(SrcElementTy, C, IdxList, DestTy,
- std::move(InRange));
+ std::move(InRange), AllocMarker);
Result->SubclassOptionalData = Flags;
return Result;
}
@@ -318,7 +329,8 @@ template <class ConstantClass> struct ConstantAggrKeyType {
using TypeClass = typename ConstantInfo<ConstantClass>::TypeClass;
ConstantClass *create(TypeClass *Ty) const {
- return new (Operands.size()) ConstantClass(Ty, Operands);
+ User::IntrusiveOperandsAllocMarker AllocMarker{unsigned(Operands.size())};
+ return new (AllocMarker) ConstantClass(Ty, Operands, AllocMarker);
}
};
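Each fixed-arity expression class above follows the same recipe; a condensed sketch with an invented class name (base-class signatures assumed from this patch):

    class TernaryConstantExprExample final : public ConstantExpr {
      // Arity is a compile-time constant, so one marker serves both the
      // placement-new and the base-class constructor.
      constexpr static IntrusiveOperandsAllocMarker AllocMarker{3};

    public:
      TernaryConstantExprExample(Type *Ty, unsigned Opcode)
          : ConstantExpr(Ty, Opcode, AllocMarker) {}
      void *operator new(size_t S) { return User::operator new(S, AllocMarker); }
      void operator delete(void *Ptr) { User::operator delete(Ptr); }
    };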
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index afef893..82ff4e1 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -402,7 +402,7 @@ Function *Function::createWithDefaultAttr(FunctionType *Ty,
LinkageTypes Linkage,
unsigned AddrSpace, const Twine &N,
Module *M) {
- auto *F = new Function(Ty, Linkage, AddrSpace, N, M);
+ auto *F = new (AllocMarker) Function(Ty, Linkage, AddrSpace, N, M);
AttrBuilder B(F->getContext());
UWTableKind UWTable = M->getUwtable();
if (UWTable != UWTableKind::None)
@@ -501,8 +501,7 @@ static unsigned computeAddrSpace(unsigned AddrSpace, Module *M) {
Function::Function(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace,
const Twine &name, Module *ParentModule)
- : GlobalObject(Ty, Value::FunctionVal,
- OperandTraits<Function>::op_begin(this), 0, Linkage, name,
+ : GlobalObject(Ty, Value::FunctionVal, AllocMarker, Linkage, name,
computeAddrSpace(AddrSpace, ParentModule)),
NumArgs(Ty->getNumParams()), IsNewDbgInfoFormat(UseNewDbgInfoFormat) {
assert(FunctionType::isValidReturnType(getReturnType()) &&
diff --git a/llvm/lib/IR/Globals.cpp b/llvm/lib/IR/Globals.cpp
index 2bc69cd..99f4fa5 100644
--- a/llvm/lib/IR/Globals.cpp
+++ b/llvm/lib/IR/Globals.cpp
@@ -442,9 +442,8 @@ GlobalVariable::GlobalVariable(Type *Ty, bool constant, LinkageTypes Link,
Constant *InitVal, const Twine &Name,
ThreadLocalMode TLMode, unsigned AddressSpace,
bool isExternallyInitialized)
- : GlobalObject(Ty, Value::GlobalVariableVal,
- OperandTraits<GlobalVariable>::op_begin(this),
- InitVal != nullptr, Link, Name, AddressSpace),
+ : GlobalObject(Ty, Value::GlobalVariableVal, AllocMarker, Link, Name,
+ AddressSpace),
isConstantGlobal(constant),
isExternallyInitializedConstant(isExternallyInitialized) {
assert(!Ty->isFunctionTy() && PointerType::isValidElementType(Ty) &&
@@ -454,6 +453,8 @@ GlobalVariable::GlobalVariable(Type *Ty, bool constant, LinkageTypes Link,
assert(InitVal->getType() == Ty &&
"Initializer should be the same type as the GlobalVariable!");
Op<0>() = InitVal;
+ } else {
+ setGlobalVariableNumOperands(0);
}
}
@@ -540,7 +541,7 @@ void GlobalVariable::setCodeModel(CodeModel::Model CM) {
GlobalAlias::GlobalAlias(Type *Ty, unsigned AddressSpace, LinkageTypes Link,
const Twine &Name, Constant *Aliasee,
Module *ParentModule)
- : GlobalValue(Ty, Value::GlobalAliasVal, &Op<0>(), 1, Link, Name,
+ : GlobalValue(Ty, Value::GlobalAliasVal, AllocMarker, Link, Name,
AddressSpace) {
setAliasee(Aliasee);
if (ParentModule)
@@ -597,7 +598,7 @@ const GlobalObject *GlobalAlias::getAliaseeObject() const {
GlobalIFunc::GlobalIFunc(Type *Ty, unsigned AddressSpace, LinkageTypes Link,
const Twine &Name, Constant *Resolver,
Module *ParentModule)
- : GlobalObject(Ty, Value::GlobalIFuncVal, &Op<0>(), 1, Link, Name,
+ : GlobalObject(Ty, Value::GlobalIFuncVal, AllocMarker, Link, Name,
AddressSpace) {
setResolver(Resolver);
if (ParentModule)
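GlobalVariable is the one class here whose operand count is dynamic (0 or 1) under a fixed-size allocation, hence the new else branch; in outline (per the hunk above):

    // Space for one operand is always reserved via AllocMarker, but the
    // in-use count must match reality:
    //   if (InitVal) Op<0>() = InitVal;               // one operand in use
    //   else         setGlobalVariableNumOperands(0); // none in use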
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 62d88ce..b1c2b02 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -32,9 +32,9 @@ InsertPosition::InsertPosition(Instruction *InsertBefore)
InsertPosition::InsertPosition(BasicBlock *InsertAtEnd)
: InsertAt(InsertAtEnd ? InsertAtEnd->end() : InstListType::iterator()) {}
-Instruction::Instruction(Type *ty, unsigned it, Use *Ops, unsigned NumOps,
+Instruction::Instruction(Type *ty, unsigned it, AllocInfo AllocInfo,
InsertPosition InsertBefore)
- : User(ty, Value::InstructionVal + it, Ops, NumOps) {
+ : User(ty, Value::InstructionVal + it, AllocInfo) {
// When called with an iterator, there must be a block to insert into.
if (InstListType::iterator InsertIt = InsertBefore; InsertIt.isValid()) {
BasicBlock *BB = InsertIt.getNodeParent();
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index 19da1f6..e95b98a 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -121,8 +121,9 @@ const char *SelectInst::areInvalidOperands(Value *Op0, Value *Op1, Value *Op2) {
//===----------------------------------------------------------------------===//
PHINode::PHINode(const PHINode &PN)
- : Instruction(PN.getType(), Instruction::PHI, nullptr, PN.getNumOperands()),
+ : Instruction(PN.getType(), Instruction::PHI, AllocMarker),
ReservedSpace(PN.getNumOperands()) {
+ NumUserOperands = PN.getNumOperands();
allocHungoffUses(PN.getNumOperands());
std::copy(PN.op_begin(), PN.op_end(), op_begin());
copyIncomingBlocks(make_range(PN.block_begin(), PN.block_end()));
@@ -243,14 +244,14 @@ bool PHINode::hasConstantOrUndefValue() const {
LandingPadInst::LandingPadInst(Type *RetTy, unsigned NumReservedValues,
const Twine &NameStr,
InsertPosition InsertBefore)
- : Instruction(RetTy, Instruction::LandingPad, nullptr, 0, InsertBefore) {
+ : Instruction(RetTy, Instruction::LandingPad, AllocMarker, InsertBefore) {
init(NumReservedValues, NameStr);
}
LandingPadInst::LandingPadInst(const LandingPadInst &LP)
- : Instruction(LP.getType(), Instruction::LandingPad, nullptr,
- LP.getNumOperands()),
+ : Instruction(LP.getType(), Instruction::LandingPad, AllocMarker),
ReservedSpace(LP.getNumOperands()) {
+ NumUserOperands = LP.getNumOperands();
allocHungoffUses(LP.getNumOperands());
Use *OL = getOperandList();
const Use *InOL = LP.getOperandList();
@@ -716,16 +717,16 @@ void CallInst::init(FunctionType *FTy, Value *Func, const Twine &NameStr) {
}
CallInst::CallInst(FunctionType *Ty, Value *Func, const Twine &Name,
- InsertPosition InsertBefore)
- : CallBase(Ty->getReturnType(), Instruction::Call,
- OperandTraits<CallBase>::op_end(this) - 1, 1, InsertBefore) {
+ AllocInfo AllocInfo, InsertPosition InsertBefore)
+ : CallBase(Ty->getReturnType(), Instruction::Call, AllocInfo,
+ InsertBefore) {
init(Ty, Func, Name);
}
-CallInst::CallInst(const CallInst &CI)
- : CallBase(CI.Attrs, CI.FTy, CI.getType(), Instruction::Call,
- OperandTraits<CallBase>::op_end(this) - CI.getNumOperands(),
- CI.getNumOperands()) {
+CallInst::CallInst(const CallInst &CI, AllocInfo AllocInfo)
+ : CallBase(CI.Attrs, CI.FTy, CI.getType(), Instruction::Call, AllocInfo) {
+ assert(getNumOperands() == CI.getNumOperands() &&
+ "Wrong number of operands allocated");
setTailCallKind(CI.getTailCallKind());
setCallingConv(CI.getCallingConv());
@@ -774,7 +775,7 @@ void InvokeInst::init(FunctionType *FTy, Value *Fn, BasicBlock *IfNormal,
const Twine &NameStr) {
this->FTy = FTy;
- assert((int)getNumOperands() ==
+ assert(getNumOperands() ==
ComputeNumOperands(Args.size(), CountBundleInputs(Bundles)) &&
"NumOperands not set up?");
@@ -803,10 +804,10 @@ void InvokeInst::init(FunctionType *FTy, Value *Fn, BasicBlock *IfNormal,
setName(NameStr);
}
-InvokeInst::InvokeInst(const InvokeInst &II)
- : CallBase(II.Attrs, II.FTy, II.getType(), Instruction::Invoke,
- OperandTraits<CallBase>::op_end(this) - II.getNumOperands(),
- II.getNumOperands()) {
+InvokeInst::InvokeInst(const InvokeInst &II, AllocInfo AllocInfo)
+ : CallBase(II.Attrs, II.FTy, II.getType(), Instruction::Invoke, AllocInfo) {
+ assert(getNumOperands() == II.getNumOperands() &&
+ "Wrong number of operands allocated");
setCallingConv(II.getCallingConv());
std::copy(II.op_begin(), II.op_end(), op_begin());
std::copy(II.bundle_op_info_begin(), II.bundle_op_info_end(),
@@ -855,9 +856,9 @@ void CallBrInst::init(FunctionType *FTy, Value *Fn, BasicBlock *Fallthrough,
const Twine &NameStr) {
this->FTy = FTy;
- assert((int)getNumOperands() ==
- ComputeNumOperands(Args.size(), IndirectDests.size(),
- CountBundleInputs(Bundles)) &&
+ assert(getNumOperands() == ComputeNumOperands(Args.size(),
+ IndirectDests.size(),
+ CountBundleInputs(Bundles)) &&
"NumOperands not set up?");
#ifndef NDEBUG
@@ -887,10 +888,11 @@ void CallBrInst::init(FunctionType *FTy, Value *Fn, BasicBlock *Fallthrough,
setName(NameStr);
}
-CallBrInst::CallBrInst(const CallBrInst &CBI)
+CallBrInst::CallBrInst(const CallBrInst &CBI, AllocInfo AllocInfo)
: CallBase(CBI.Attrs, CBI.FTy, CBI.getType(), Instruction::CallBr,
- OperandTraits<CallBase>::op_end(this) - CBI.getNumOperands(),
- CBI.getNumOperands()) {
+ AllocInfo) {
+ assert(getNumOperands() == CBI.getNumOperands() &&
+ "Wrong number of operands allocated");
setCallingConv(CBI.getCallingConv());
std::copy(CBI.op_begin(), CBI.op_end(), op_begin());
std::copy(CBI.bundle_op_info_begin(), CBI.bundle_op_info_end(),
@@ -918,19 +920,19 @@ CallBrInst *CallBrInst::Create(CallBrInst *CBI, ArrayRef<OperandBundleDef> OpB,
// ReturnInst Implementation
//===----------------------------------------------------------------------===//
-ReturnInst::ReturnInst(const ReturnInst &RI)
+ReturnInst::ReturnInst(const ReturnInst &RI, AllocInfo AllocInfo)
: Instruction(Type::getVoidTy(RI.getContext()), Instruction::Ret,
- OperandTraits<ReturnInst>::op_end(this) - RI.getNumOperands(),
- RI.getNumOperands()) {
+ AllocInfo) {
+ assert(getNumOperands() == RI.getNumOperands() &&
+ "Wrong number of operands allocated");
if (RI.getNumOperands())
Op<0>() = RI.Op<0>();
SubclassOptionalData = RI.SubclassOptionalData;
}
-ReturnInst::ReturnInst(LLVMContext &C, Value *retVal,
+ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, AllocInfo AllocInfo,
InsertPosition InsertBefore)
- : Instruction(Type::getVoidTy(C), Instruction::Ret,
- OperandTraits<ReturnInst>::op_end(this) - !!retVal, !!retVal,
+ : Instruction(Type::getVoidTy(C), Instruction::Ret, AllocInfo,
InsertBefore) {
if (retVal)
Op<0>() = retVal;
@@ -942,13 +944,13 @@ ReturnInst::ReturnInst(LLVMContext &C, Value *retVal,
ResumeInst::ResumeInst(const ResumeInst &RI)
: Instruction(Type::getVoidTy(RI.getContext()), Instruction::Resume,
- OperandTraits<ResumeInst>::op_begin(this), 1) {
+ AllocMarker) {
Op<0>() = RI.Op<0>();
}
ResumeInst::ResumeInst(Value *Exn, InsertPosition InsertBefore)
: Instruction(Type::getVoidTy(Exn->getContext()), Instruction::Resume,
- OperandTraits<ResumeInst>::op_begin(this), 1, InsertBefore) {
+ AllocMarker, InsertBefore) {
Op<0>() = Exn;
}
@@ -956,11 +958,11 @@ ResumeInst::ResumeInst(Value *Exn, InsertPosition InsertBefore)
// CleanupReturnInst Implementation
//===----------------------------------------------------------------------===//
-CleanupReturnInst::CleanupReturnInst(const CleanupReturnInst &CRI)
- : Instruction(CRI.getType(), Instruction::CleanupRet,
- OperandTraits<CleanupReturnInst>::op_end(this) -
- CRI.getNumOperands(),
- CRI.getNumOperands()) {
+CleanupReturnInst::CleanupReturnInst(const CleanupReturnInst &CRI,
+ AllocInfo AllocInfo)
+ : Instruction(CRI.getType(), Instruction::CleanupRet, AllocInfo) {
+ assert(getNumOperands() == CRI.getNumOperands() &&
+ "Wrong number of operands allocated");
setSubclassData<Instruction::OpaqueField>(
CRI.getSubclassData<Instruction::OpaqueField>());
Op<0>() = CRI.Op<0>();
@@ -978,12 +980,10 @@ void CleanupReturnInst::init(Value *CleanupPad, BasicBlock *UnwindBB) {
}
CleanupReturnInst::CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB,
- unsigned Values,
+ AllocInfo AllocInfo,
InsertPosition InsertBefore)
: Instruction(Type::getVoidTy(CleanupPad->getContext()),
- Instruction::CleanupRet,
- OperandTraits<CleanupReturnInst>::op_end(this) - Values,
- Values, InsertBefore) {
+ Instruction::CleanupRet, AllocInfo, InsertBefore) {
init(CleanupPad, UnwindBB);
}
@@ -997,7 +997,7 @@ void CatchReturnInst::init(Value *CatchPad, BasicBlock *BB) {
CatchReturnInst::CatchReturnInst(const CatchReturnInst &CRI)
: Instruction(Type::getVoidTy(CRI.getContext()), Instruction::CatchRet,
- OperandTraits<CatchReturnInst>::op_begin(this), 2) {
+ AllocMarker) {
Op<0>() = CRI.Op<0>();
Op<1>() = CRI.Op<1>();
}
@@ -1005,8 +1005,7 @@ CatchReturnInst::CatchReturnInst(const CatchReturnInst &CRI)
CatchReturnInst::CatchReturnInst(Value *CatchPad, BasicBlock *BB,
InsertPosition InsertBefore)
: Instruction(Type::getVoidTy(BB->getContext()), Instruction::CatchRet,
- OperandTraits<CatchReturnInst>::op_begin(this), 2,
- InsertBefore) {
+ AllocMarker, InsertBefore) {
init(CatchPad, BB);
}
@@ -1018,7 +1017,7 @@ CatchSwitchInst::CatchSwitchInst(Value *ParentPad, BasicBlock *UnwindDest,
unsigned NumReservedValues,
const Twine &NameStr,
InsertPosition InsertBefore)
- : Instruction(ParentPad->getType(), Instruction::CatchSwitch, nullptr, 0,
+ : Instruction(ParentPad->getType(), Instruction::CatchSwitch, AllocMarker,
InsertBefore) {
if (UnwindDest)
++NumReservedValues;
@@ -1027,8 +1026,8 @@ CatchSwitchInst::CatchSwitchInst(Value *ParentPad, BasicBlock *UnwindDest,
}
CatchSwitchInst::CatchSwitchInst(const CatchSwitchInst &CSI)
- : Instruction(CSI.getType(), Instruction::CatchSwitch, nullptr,
- CSI.getNumOperands()) {
+ : Instruction(CSI.getType(), Instruction::CatchSwitch, AllocMarker) {
+ NumUserOperands = CSI.NumUserOperands;
init(CSI.getParentPad(), CSI.getUnwindDest(), CSI.getNumOperands());
setNumHungOffUseOperands(ReservedSpace);
Use *OL = getOperandList();
@@ -1093,22 +1092,19 @@ void FuncletPadInst::init(Value *ParentPad, ArrayRef<Value *> Args,
setName(NameStr);
}
-FuncletPadInst::FuncletPadInst(const FuncletPadInst &FPI)
- : Instruction(FPI.getType(), FPI.getOpcode(),
- OperandTraits<FuncletPadInst>::op_end(this) -
- FPI.getNumOperands(),
- FPI.getNumOperands()) {
+FuncletPadInst::FuncletPadInst(const FuncletPadInst &FPI, AllocInfo AllocInfo)
+ : Instruction(FPI.getType(), FPI.getOpcode(), AllocInfo) {
+ assert(getNumOperands() == FPI.getNumOperands() &&
+ "Wrong number of operands allocated");
std::copy(FPI.op_begin(), FPI.op_end(), op_begin());
setParentPad(FPI.getParentPad());
}
FuncletPadInst::FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad,
- ArrayRef<Value *> Args, unsigned Values,
+ ArrayRef<Value *> Args, AllocInfo AllocInfo,
const Twine &NameStr,
InsertPosition InsertBefore)
- : Instruction(ParentPad->getType(), Op,
- OperandTraits<FuncletPadInst>::op_end(this) - Values, Values,
- InsertBefore) {
+ : Instruction(ParentPad->getType(), Op, AllocInfo, InsertBefore) {
init(ParentPad, Args, NameStr);
}
@@ -1118,8 +1114,8 @@ FuncletPadInst::FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad,
UnreachableInst::UnreachableInst(LLVMContext &Context,
InsertPosition InsertBefore)
- : Instruction(Type::getVoidTy(Context), Instruction::Unreachable, nullptr,
- 0, InsertBefore) {}
+ : Instruction(Type::getVoidTy(Context), Instruction::Unreachable,
+ AllocMarker, InsertBefore) {}
//===----------------------------------------------------------------------===//
// BranchInst Implementation
@@ -1131,19 +1127,18 @@ void BranchInst::AssertOK() {
"May only branch on boolean predicates!");
}
-BranchInst::BranchInst(BasicBlock *IfTrue, InsertPosition InsertBefore)
+BranchInst::BranchInst(BasicBlock *IfTrue, AllocInfo AllocInfo,
+ InsertPosition InsertBefore)
: Instruction(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
- OperandTraits<BranchInst>::op_end(this) - 1, 1,
- InsertBefore) {
+ AllocInfo, InsertBefore) {
assert(IfTrue && "Branch destination may not be null!");
Op<-1>() = IfTrue;
}
BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
- InsertPosition InsertBefore)
+ AllocInfo AllocInfo, InsertPosition InsertBefore)
: Instruction(Type::getVoidTy(IfTrue->getContext()), Instruction::Br,
- OperandTraits<BranchInst>::op_end(this) - 3, 3,
- InsertBefore) {
+ AllocInfo, InsertBefore) {
// Assign in order of operand index to make use-list order predictable.
Op<-3>() = Cond;
Op<-2>() = IfFalse;
@@ -1153,10 +1148,11 @@ BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond,
#endif
}
-BranchInst::BranchInst(const BranchInst &BI)
+BranchInst::BranchInst(const BranchInst &BI, AllocInfo AllocInfo)
: Instruction(Type::getVoidTy(BI.getContext()), Instruction::Br,
- OperandTraits<BranchInst>::op_end(this) - BI.getNumOperands(),
- BI.getNumOperands()) {
+ AllocInfo) {
+ assert(getNumOperands() == BI.getNumOperands() &&
+ "Wrong number of operands allocated");
// Assign in order of operand index to make use-list order predictable.
if (BI.getNumOperands() != 1) {
assert(BI.getNumOperands() == 3 && "BR can have 1 or 3 operands!");
@@ -1313,9 +1309,8 @@ StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, Align Align,
StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, Align Align,
AtomicOrdering Order, SyncScope::ID SSID,
InsertPosition InsertBefore)
- : Instruction(Type::getVoidTy(val->getContext()), Store,
- OperandTraits<StoreInst>::op_begin(this),
- OperandTraits<StoreInst>::operands(this), InsertBefore) {
+ : Instruction(Type::getVoidTy(val->getContext()), Store, AllocMarker,
+ InsertBefore) {
Op<0>() = val;
Op<1>() = addr;
setVolatile(isVolatile);
@@ -1356,8 +1351,7 @@ AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal,
InsertPosition InsertBefore)
: Instruction(
StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext())),
- AtomicCmpXchg, OperandTraits<AtomicCmpXchgInst>::op_begin(this),
- OperandTraits<AtomicCmpXchgInst>::operands(this), InsertBefore) {
+ AtomicCmpXchg, AllocMarker, InsertBefore) {
Init(Ptr, Cmp, NewVal, Alignment, SuccessOrdering, FailureOrdering, SSID);
}
@@ -1389,9 +1383,7 @@ void AtomicRMWInst::Init(BinOp Operation, Value *Ptr, Value *Val,
AtomicRMWInst::AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val,
Align Alignment, AtomicOrdering Ordering,
SyncScope::ID SSID, InsertPosition InsertBefore)
- : Instruction(Val->getType(), AtomicRMW,
- OperandTraits<AtomicRMWInst>::op_begin(this),
- OperandTraits<AtomicRMWInst>::operands(this), InsertBefore) {
+ : Instruction(Val->getType(), AtomicRMW, AllocMarker, InsertBefore) {
Init(Operation, Ptr, Val, Alignment, Ordering, SSID);
}
@@ -1448,7 +1440,7 @@ StringRef AtomicRMWInst::getOperationName(BinOp Op) {
FenceInst::FenceInst(LLVMContext &C, AtomicOrdering Ordering,
SyncScope::ID SSID, InsertPosition InsertBefore)
- : Instruction(Type::getVoidTy(C), Fence, nullptr, 0, InsertBefore) {
+ : Instruction(Type::getVoidTy(C), Fence, AllocMarker, InsertBefore) {
setOrdering(Ordering);
setSyncScopeID(SSID);
}
@@ -1466,13 +1458,13 @@ void GetElementPtrInst::init(Value *Ptr, ArrayRef<Value *> IdxList,
setName(Name);
}
-GetElementPtrInst::GetElementPtrInst(const GetElementPtrInst &GEPI)
- : Instruction(GEPI.getType(), GetElementPtr,
- OperandTraits<GetElementPtrInst>::op_end(this) -
- GEPI.getNumOperands(),
- GEPI.getNumOperands()),
+GetElementPtrInst::GetElementPtrInst(const GetElementPtrInst &GEPI,
+ AllocInfo AllocInfo)
+ : Instruction(GEPI.getType(), GetElementPtr, AllocInfo),
SourceElementType(GEPI.SourceElementType),
ResultElementType(GEPI.ResultElementType) {
+ assert(getNumOperands() == GEPI.getNumOperands() &&
+ "Wrong number of operands allocated");
std::copy(GEPI.op_begin(), GEPI.op_end(), op_begin());
SubclassOptionalData = GEPI.SubclassOptionalData;
}
@@ -1606,9 +1598,8 @@ bool GetElementPtrInst::collectOffset(
ExtractElementInst::ExtractElementInst(Value *Val, Value *Index,
const Twine &Name,
InsertPosition InsertBef)
- : Instruction(
- cast<VectorType>(Val->getType())->getElementType(), ExtractElement,
- OperandTraits<ExtractElementInst>::op_begin(this), 2, InsertBef) {
+ : Instruction(cast<VectorType>(Val->getType())->getElementType(),
+ ExtractElement, AllocMarker, InsertBef) {
assert(isValidOperands(Val, Index) &&
"Invalid extractelement instruction operands!");
Op<0>() = Val;
@@ -1629,9 +1620,7 @@ bool ExtractElementInst::isValidOperands(const Value *Val, const Value *Index) {
InsertElementInst::InsertElementInst(Value *Vec, Value *Elt, Value *Index,
const Twine &Name,
InsertPosition InsertBef)
- : Instruction(Vec->getType(), InsertElement,
- OperandTraits<InsertElementInst>::op_begin(this), 3,
- InsertBef) {
+ : Instruction(Vec->getType(), InsertElement, AllocMarker, InsertBef) {
assert(isValidOperands(Vec, Elt, Index) &&
"Invalid insertelement instruction operands!");
Op<0>() = Vec;
@@ -1679,8 +1668,7 @@ ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *V2, Value *Mask,
: Instruction(
VectorType::get(cast<VectorType>(V1->getType())->getElementType(),
cast<VectorType>(Mask->getType())->getElementCount()),
- ShuffleVector, OperandTraits<ShuffleVectorInst>::op_begin(this),
- OperandTraits<ShuffleVectorInst>::operands(this), InsertBefore) {
+ ShuffleVector, AllocMarker, InsertBefore) {
assert(isValidOperands(V1, V2, Mask) &&
"Invalid shuffle vector instruction operands!");
@@ -1698,8 +1686,7 @@ ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *V2, ArrayRef<int> Mask,
: Instruction(
VectorType::get(cast<VectorType>(V1->getType())->getElementType(),
Mask.size(), isa<ScalableVectorType>(V1->getType())),
- ShuffleVector, OperandTraits<ShuffleVectorInst>::op_begin(this),
- OperandTraits<ShuffleVectorInst>::operands(this), InsertBefore) {
+ ShuffleVector, AllocMarker, InsertBefore) {
assert(isValidOperands(V1, V2, Mask) &&
"Invalid shuffle vector instruction operands!");
Op<0>() = V1;
@@ -2464,9 +2451,8 @@ void InsertValueInst::init(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs,
}
InsertValueInst::InsertValueInst(const InsertValueInst &IVI)
- : Instruction(IVI.getType(), InsertValue,
- OperandTraits<InsertValueInst>::op_begin(this), 2),
- Indices(IVI.Indices) {
+ : Instruction(IVI.getType(), InsertValue, AllocMarker),
+ Indices(IVI.Indices) {
Op<0>() = IVI.getOperand(0);
Op<1>() = IVI.getOperand(1);
SubclassOptionalData = IVI.SubclassOptionalData;
@@ -2565,8 +2551,7 @@ void UnaryOperator::AssertOK() {
BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2, Type *Ty,
const Twine &Name, InsertPosition InsertBefore)
- : Instruction(Ty, iType, OperandTraits<BinaryOperator>::op_begin(this),
- OperandTraits<BinaryOperator>::operands(this), InsertBefore) {
+ : Instruction(Ty, iType, AllocMarker, InsertBefore) {
Op<0>() = S1;
Op<1>() = S2;
setName(Name);
@@ -3427,8 +3412,7 @@ AddrSpaceCastInst::AddrSpaceCastInst(Value *S, Type *Ty, const Twine &Name,
CmpInst::CmpInst(Type *ty, OtherOps op, Predicate predicate, Value *LHS,
Value *RHS, const Twine &Name, InsertPosition InsertBefore,
Instruction *FlagsSource)
- : Instruction(ty, op, OperandTraits<CmpInst>::op_begin(this),
- OperandTraits<CmpInst>::operands(this), InsertBefore) {
+ : Instruction(ty, op, AllocMarker, InsertBefore) {
Op<0>() = LHS;
Op<1>() = RHS;
setPredicate((Predicate)predicate);
@@ -3918,12 +3902,12 @@ void SwitchInst::init(Value *Value, BasicBlock *Default, unsigned NumReserved) {
SwitchInst::SwitchInst(Value *Value, BasicBlock *Default, unsigned NumCases,
InsertPosition InsertBefore)
: Instruction(Type::getVoidTy(Value->getContext()), Instruction::Switch,
- nullptr, 0, InsertBefore) {
+ AllocMarker, InsertBefore) {
init(Value, Default, 2+NumCases*2);
}
SwitchInst::SwitchInst(const SwitchInst &SI)
- : Instruction(SI.getType(), Instruction::Switch, nullptr, 0) {
+ : Instruction(SI.getType(), Instruction::Switch, AllocMarker) {
init(SI.getCondition(), SI.getDefaultDest(), SI.getNumOperands());
setNumHungOffUseOperands(SI.getNumOperands());
Use *OL = getOperandList();
@@ -4125,13 +4109,14 @@ void IndirectBrInst::growOperands() {
IndirectBrInst::IndirectBrInst(Value *Address, unsigned NumCases,
InsertPosition InsertBefore)
: Instruction(Type::getVoidTy(Address->getContext()),
- Instruction::IndirectBr, nullptr, 0, InsertBefore) {
+ Instruction::IndirectBr, AllocMarker, InsertBefore) {
init(Address, NumCases);
}
IndirectBrInst::IndirectBrInst(const IndirectBrInst &IBI)
: Instruction(Type::getVoidTy(IBI.getContext()), Instruction::IndirectBr,
- nullptr, IBI.getNumOperands()) {
+ AllocMarker) {
+ NumUserOperands = IBI.NumUserOperands;
allocHungoffUses(IBI.getNumOperands());
Use *OL = getOperandList();
const Use *InOL = IBI.getOperandList();
@@ -4185,7 +4170,8 @@ FreezeInst::FreezeInst(Value *S, const Twine &Name, InsertPosition InsertBefore)
// unit that uses these classes.
GetElementPtrInst *GetElementPtrInst::cloneImpl() const {
- return new (getNumOperands()) GetElementPtrInst(*this);
+ IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()};
+ return new (AllocMarker) GetElementPtrInst(*this, AllocMarker);
}
UnaryOperator *UnaryOperator::cloneImpl() const {
@@ -4305,10 +4291,13 @@ AddrSpaceCastInst *AddrSpaceCastInst::cloneImpl() const {
CallInst *CallInst::cloneImpl() const {
if (hasOperandBundles()) {
- unsigned DescriptorBytes = getNumOperandBundles() * sizeof(BundleOpInfo);
- return new(getNumOperands(), DescriptorBytes) CallInst(*this);
+ IntrusiveOperandsAndDescriptorAllocMarker AllocMarker{
+ getNumOperands(),
+ getNumOperandBundles() * unsigned(sizeof(BundleOpInfo))};
+ return new (AllocMarker) CallInst(*this, AllocMarker);
}
- return new(getNumOperands()) CallInst(*this);
+ IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()};
+ return new (AllocMarker) CallInst(*this, AllocMarker);
}
SelectInst *SelectInst::cloneImpl() const {
@@ -4331,18 +4320,20 @@ ShuffleVectorInst *ShuffleVectorInst::cloneImpl() const {
return new ShuffleVectorInst(getOperand(0), getOperand(1), getShuffleMask());
}
-PHINode *PHINode::cloneImpl() const { return new PHINode(*this); }
+PHINode *PHINode::cloneImpl() const { return new (AllocMarker) PHINode(*this); }
LandingPadInst *LandingPadInst::cloneImpl() const {
return new LandingPadInst(*this);
}
ReturnInst *ReturnInst::cloneImpl() const {
- return new(getNumOperands()) ReturnInst(*this);
+ IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()};
+ return new (AllocMarker) ReturnInst(*this, AllocMarker);
}
BranchInst *BranchInst::cloneImpl() const {
- return new(getNumOperands()) BranchInst(*this);
+ IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()};
+ return new (AllocMarker) BranchInst(*this, AllocMarker);
}
SwitchInst *SwitchInst::cloneImpl() const { return new SwitchInst(*this); }
@@ -4353,28 +4344,37 @@ IndirectBrInst *IndirectBrInst::cloneImpl() const {
InvokeInst *InvokeInst::cloneImpl() const {
if (hasOperandBundles()) {
- unsigned DescriptorBytes = getNumOperandBundles() * sizeof(BundleOpInfo);
- return new(getNumOperands(), DescriptorBytes) InvokeInst(*this);
+ IntrusiveOperandsAndDescriptorAllocMarker AllocMarker{
+ getNumOperands(),
+ getNumOperandBundles() * unsigned(sizeof(BundleOpInfo))};
+ return new (AllocMarker) InvokeInst(*this, AllocMarker);
}
- return new(getNumOperands()) InvokeInst(*this);
+ IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()};
+ return new (AllocMarker) InvokeInst(*this, AllocMarker);
}
CallBrInst *CallBrInst::cloneImpl() const {
if (hasOperandBundles()) {
- unsigned DescriptorBytes = getNumOperandBundles() * sizeof(BundleOpInfo);
- return new (getNumOperands(), DescriptorBytes) CallBrInst(*this);
+ IntrusiveOperandsAndDescriptorAllocMarker AllocMarker{
+ getNumOperands(),
+ getNumOperandBundles() * unsigned(sizeof(BundleOpInfo))};
+ return new (AllocMarker) CallBrInst(*this, AllocMarker);
}
- return new (getNumOperands()) CallBrInst(*this);
+ IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()};
+ return new (AllocMarker) CallBrInst(*this, AllocMarker);
}
-ResumeInst *ResumeInst::cloneImpl() const { return new (1) ResumeInst(*this); }
+ResumeInst *ResumeInst::cloneImpl() const {
+ return new (AllocMarker) ResumeInst(*this);
+}
CleanupReturnInst *CleanupReturnInst::cloneImpl() const {
- return new (getNumOperands()) CleanupReturnInst(*this);
+ IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()};
+ return new (AllocMarker) CleanupReturnInst(*this, AllocMarker);
}
CatchReturnInst *CatchReturnInst::cloneImpl() const {
- return new (getNumOperands()) CatchReturnInst(*this);
+ return new (AllocMarker) CatchReturnInst(*this);
}
CatchSwitchInst *CatchSwitchInst::cloneImpl() const {
@@ -4382,7 +4382,8 @@ CatchSwitchInst *CatchSwitchInst::cloneImpl() const {
}
FuncletPadInst *FuncletPadInst::cloneImpl() const {
- return new (getNumOperands()) FuncletPadInst(*this);
+ IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()};
+ return new (AllocMarker) FuncletPadInst(*this, AllocMarker);
}
UnreachableInst *UnreachableInst::cloneImpl() const {
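All of the cloneImpl() rewrites above share one shape: build a marker from the runtime operand count, then hand it to both the placement-new and the copy constructor, which asserts the counts agree. Generic sketch (Inst is a placeholder):

    Inst *Inst::cloneImpl() const {
      IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()};
      // The (Inst &, AllocInfo) constructor asserts getNumOperands() matches.
      return new (AllocMarker) Inst(*this, AllocMarker);
    }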
diff --git a/llvm/lib/IR/User.cpp b/llvm/lib/IR/User.cpp
index 00dd9c7..b0aa785 100644
--- a/llvm/lib/IR/User.cpp
+++ b/llvm/lib/IR/User.cpp
@@ -145,10 +145,7 @@ void *User::allocateFixedOperandUser(size_t Size, unsigned Us,
::operator new(Size + sizeof(Use) * Us + DescBytesToAllocate));
Use *Start = reinterpret_cast<Use *>(Storage + DescBytesToAllocate);
Use *End = Start + Us;
- User *Obj = reinterpret_cast<User*>(End);
- Obj->NumUserOperands = Us;
- Obj->HasHungOffUses = false;
- Obj->HasDescriptor = DescBytes != 0;
+ User *Obj = reinterpret_cast<User *>(End);
for (; Start != End; Start++)
new (Start) Use(Obj);
@@ -160,22 +157,21 @@ void *User::allocateFixedOperandUser(size_t Size, unsigned Us,
return Obj;
}
-void *User::operator new(size_t Size, unsigned Us) {
- return allocateFixedOperandUser(Size, Us, 0);
+void *User::operator new(size_t Size, IntrusiveOperandsAllocMarker allocTrait) {
+ return allocateFixedOperandUser(Size, allocTrait.NumOps, 0);
}
-void *User::operator new(size_t Size, unsigned Us, unsigned DescBytes) {
- return allocateFixedOperandUser(Size, Us, DescBytes);
+void *User::operator new(size_t Size,
+ IntrusiveOperandsAndDescriptorAllocMarker allocTrait) {
+ return allocateFixedOperandUser(Size, allocTrait.NumOps,
+ allocTrait.DescBytes);
}
-void *User::operator new(size_t Size) {
+void *User::operator new(size_t Size, HungOffOperandsAllocMarker) {
// Allocate space for a single Use*
void *Storage = ::operator new(Size + sizeof(Use *));
Use **HungOffOperandList = static_cast<Use **>(Storage);
User *Obj = reinterpret_cast<User *>(HungOffOperandList + 1);
- Obj->NumUserOperands = 0;
- Obj->HasHungOffUses = true;
- Obj->HasDescriptor = false;
*HungOffOperandList = nullptr;
return Obj;
}
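The three operator-new overloads now map one marker type to one memory layout; as inferred from this hunk (the flag setup deleted here presumably moves into the User constructor via AllocInfo, which this hunk does not show):

    // IntrusiveOperandsAllocMarker{N}                 -> N Uses co-allocated
    //                                                    before the User
    // IntrusiveOperandsAndDescriptorAllocMarker{N, B} -> N Uses plus B
    //                                                    descriptor bytes
    // HungOffOperandsAllocMarker{}                    -> one Use* slot;
    //                                                    operands hung off later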
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 6ebf262..8f151a9 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1181,8 +1181,8 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
// Enable contextual profiling instrumentation.
const bool IsCtxProfGen = !IsPGOInstrGen && IsPreLink &&
PGOCtxProfLoweringPass::isCtxIRPGOInstrEnabled();
- const bool IsCtxProfUse = !UseCtxProfile.empty() && !PGOOpt &&
- Phase == ThinOrFullLTOPhase::ThinLTOPreLink;
+ const bool IsCtxProfUse =
+ !UseCtxProfile.empty() && Phase == ThinOrFullLTOPhase::ThinLTOPreLink;
if (IsPGOInstrGen || IsPGOInstrUse || IsMemprofUse || IsCtxProfGen ||
IsCtxProfUse)
@@ -1673,7 +1673,7 @@ PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) {
// In pre-link, for ctx prof use, we stop here with an instrumented IR. We let
// thinlto use the contextual info to perform imports; then use the contextual
// profile in the post-thinlink phase.
- if (!UseCtxProfile.empty() && !PGOOpt) {
+ if (!UseCtxProfile.empty()) {
addRequiredLTOPreLinkPasses(MPM);
return MPM;
}
diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp
index 18fdcda..df38395 100644
--- a/llvm/lib/SandboxIR/SandboxIR.cpp
+++ b/llvm/lib/SandboxIR/SandboxIR.cpp
@@ -662,6 +662,12 @@ Value *SelectInst::create(Value *Cond, Value *True, Value *False,
return createCommon(Cond, True, False, Name, Builder, Ctx);
}
+void SelectInst::swapValues() {
+ Ctx.getTracker().emplaceIfTracking<UseSwap>(getOperandUse(1),
+ getOperandUse(2));
+ cast<llvm::SelectInst>(Val)->swapValues();
+}
+
bool SelectInst::classof(const Value *From) {
return From->getSubclassID() == ClassID::Select;
}
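This makes swapValues() follow the usual SandboxIR mutation idiom: record the inverse action first, mutate the wrapped llvm:: instruction second. A hedged usage sketch (the save/revert Tracker calls are assumed beyond what this hunk shows):

    // With tracking enabled, the swap can be rolled back:
    Ctx.getTracker().save();   // assumed API: begin tracking changes
    Sel->swapValues();         // queues a UseSwap, then mutates the IR
    Ctx.getTracker().revert(); // assumed API: replays the UseSwap to undo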
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 4da3618..9f89264 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -353,23 +353,20 @@ bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
}
/// Match an fpext from half to float, or a constant we can convert.
-static bool matchFPExtFromF16(Value *Arg, Value *&FPExtSrc) {
- if (match(Arg, m_OneUse(m_FPExt(m_Value(FPExtSrc)))))
- return FPExtSrc->getType()->isHalfTy();
-
- ConstantFP *CFP;
- if (match(Arg, m_ConstantFP(CFP))) {
+static Value *matchFPExtFromF16(Value *Arg) {
+ Value *Src = nullptr;
+ ConstantFP *CFP = nullptr;
+ if (match(Arg, m_OneUse(m_FPExt(m_Value(Src))))) {
+ if (Src->getType()->isHalfTy())
+ return Src;
+ } else if (match(Arg, m_ConstantFP(CFP))) {
bool LosesInfo;
APFloat Val(CFP->getValueAPF());
Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
- if (LosesInfo)
- return false;
-
- FPExtSrc = ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
- return true;
+ if (!LosesInfo)
+ return ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
}
-
- return false;
+ return nullptr;
}
// Trim all zero components from the end of the vector \p UseV and return
@@ -839,15 +836,16 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
if (!ST->hasMed3_16())
break;
- Value *X, *Y, *Z;
-
// Repeat floating-point width reduction done for minnum/maxnum.
// fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
- if (matchFPExtFromF16(Src0, X) && matchFPExtFromF16(Src1, Y) &&
- matchFPExtFromF16(Src2, Z)) {
- Value *NewCall = IC.Builder.CreateIntrinsic(IID, {X->getType()},
- {X, Y, Z}, &II, II.getName());
- return new FPExtInst(NewCall, II.getType());
+ if (Value *X = matchFPExtFromF16(Src0)) {
+ if (Value *Y = matchFPExtFromF16(Src1)) {
+ if (Value *Z = matchFPExtFromF16(Src2)) {
+ Value *NewCall = IC.Builder.CreateIntrinsic(
+ IID, {X->getType()}, {X, Y, Z}, &II, II.getName());
+ return new FPExtInst(NewCall, II.getType());
+ }
+ }
}
break;
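The helper's refactor from bool-plus-out-parameter to returning the matched value is what enables the nested init-statement chain below it; in miniature:

    // before: Value *X, *Y;  // declared up front, unset on failed matches
    //         if (matchFPExtFromF16(A, X) && matchFPExtFromF16(B, Y)) use(X, Y);
    // after:  if (Value *X = matchFPExtFromF16(A))
    //           if (Value *Y = matchFPExtFromF16(B))
    //             use(X, Y);  // each name is scoped to its success path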
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 56f4efd..e657f66 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5439,6 +5439,8 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
IID == Intrinsic::amdgcn_permlanex16;
+ bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
+ IID == Intrinsic::amdgcn_set_inactive_chain_arg;
auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
Register Src2, LLT VT) -> Register {
@@ -5448,6 +5450,8 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
case Intrinsic::amdgcn_permlane64:
return LaneOp.getReg(0);
case Intrinsic::amdgcn_readlane:
+ case Intrinsic::amdgcn_set_inactive:
+ case Intrinsic::amdgcn_set_inactive_chain_arg:
return LaneOp.addUse(Src1).getReg(0);
case Intrinsic::amdgcn_writelane:
return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
@@ -5472,7 +5476,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
Register Src0 = MI.getOperand(2).getReg();
Register Src1, Src2;
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
- IsPermLane16) {
+ IsSetInactive || IsPermLane16) {
Src1 = MI.getOperand(3).getReg();
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
Src2 = MI.getOperand(4).getReg();
@@ -5490,7 +5494,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
if (Size < 32) {
Src0 = B.buildAnyExt(S32, Src0).getReg(0);
- if (IsPermLane16)
+ if (IsSetInactive || IsPermLane16)
Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
if (IID == Intrinsic::amdgcn_writelane)
@@ -5526,7 +5530,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
MachineInstrBuilder Src1Parts, Src2Parts;
- if (IsPermLane16)
+ if (IsSetInactive || IsPermLane16)
Src1Parts = B.buildUnmerge(PartialResTy, Src1);
if (IID == Intrinsic::amdgcn_writelane)
@@ -5535,7 +5539,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
for (unsigned i = 0; i < NumParts; ++i) {
Src0 = Src0Parts.getReg(i);
- if (IsPermLane16)
+ if (IsSetInactive || IsPermLane16)
Src1 = Src1Parts.getReg(i);
if (IID == Intrinsic::amdgcn_writelane)
@@ -7496,6 +7500,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
case Intrinsic::amdgcn_permlane64:
+ case Intrinsic::amdgcn_set_inactive:
+ case Intrinsic::amdgcn_set_inactive_chain_arg:
return legalizeLaneOp(Helper, MI, IntrID);
case Intrinsic::amdgcn_s_buffer_prefetch_data:
return legalizeSBufferPrefetch(Helper, MI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9a6d617..62e22c1 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6102,6 +6102,8 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
unsigned IID = N->getConstantOperandVal(0);
bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
IID == Intrinsic::amdgcn_permlanex16;
+ bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
+ IID == Intrinsic::amdgcn_set_inactive_chain_arg;
SDLoc SL(N);
MVT IntVT = MVT::getIntegerVT(ValSize);
@@ -6119,6 +6121,8 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
Operands.push_back(Src2);
[[fallthrough]];
case Intrinsic::amdgcn_readlane:
+ case Intrinsic::amdgcn_set_inactive:
+ case Intrinsic::amdgcn_set_inactive_chain_arg:
Operands.push_back(Src1);
[[fallthrough]];
case Intrinsic::amdgcn_readfirstlane:
@@ -6145,7 +6149,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
SDValue Src0 = N->getOperand(1);
SDValue Src1, Src2;
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
- IsPermLane16) {
+ IsSetInactive || IsPermLane16) {
Src1 = N->getOperand(2);
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
Src2 = N->getOperand(3);
@@ -6161,7 +6165,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
SL, MVT::i32);
- if (IsPermLane16) {
+ if (IsSetInactive || IsPermLane16) {
Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
SL, MVT::i32);
}
@@ -6237,7 +6241,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
DAG.getConstant(EltIdx, SL, MVT::i32));
- if (IsPermLane16)
+ if (IsSetInactive || IsPermLane16)
Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
DAG.getConstant(EltIdx, SL, MVT::i32));
@@ -6246,7 +6250,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
DAG.getConstant(EltIdx, SL, MVT::i32));
Pieces.push_back(
- IsPermLane16
+ IsSetInactive || IsPermLane16
? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
: createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
EltIdx += 2;
@@ -6262,7 +6266,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
Src0 = DAG.getBitcast(VecVT, Src0);
- if (IsPermLane16)
+ if (IsSetInactive || IsPermLane16)
Src1 = DAG.getBitcast(VecVT, Src1);
if (IID == Intrinsic::amdgcn_writelane)
@@ -8745,6 +8749,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
case Intrinsic::amdgcn_permlane64:
+ case Intrinsic::amdgcn_set_inactive:
+ case Intrinsic::amdgcn_set_inactive_chain_arg:
return lowerLaneOp(*this, Op.getNode(), DAG);
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
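Both the GlobalISel and SelectionDAG paths now classify set_inactive as a two-source lane op, so the inactive value is widened and split in lockstep with the active one; the shared predicate (verbatim from the hunks above):

    bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                         IID == Intrinsic::amdgcn_set_inactive_chain_arg;
    // Every splitting decision that previously tested IsPermLane16 now tests
    // (IsSetInactive || IsPermLane16), covering Src1 ext/unmerge/bitcast.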
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 899d937..e4a679f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2097,21 +2097,6 @@ unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
}
}
-Register SIInstrInfo::findSetInactiveMask(const MachineInstr &MI) {
- assert(MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
- MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64);
- for (auto &Op : MI.implicit_operands()) {
- if (Op.isDef())
- continue;
- Register OpReg = Op.getReg();
- if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
- OpReg == AMDGPU::SCC)
- continue;
- return OpReg;
- }
- return Register();
-}
-
bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MBB.findDebugLoc(MI);
@@ -2286,147 +2271,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
- case AMDGPU::V_SET_INACTIVE_B32:
- case AMDGPU::V_SET_INACTIVE_B64: {
- unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
- unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
- ? AMDGPU::V_MOV_B64_PSEUDO
- : AMDGPU::V_MOV_B32_e32;
- Register ExecReg = RI.getExec();
+ case AMDGPU::V_SET_INACTIVE_B32: {
+ // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
Register DstReg = MI.getOperand(0).getReg();
- MachineOperand &ActiveSrc = MI.getOperand(1);
- MachineOperand &InactiveSrc = MI.getOperand(2);
-
- // Find implicit register defining lanes active outside WWM.
- Register ExecSrcReg = findSetInactiveMask(MI);
- assert(ExecSrcReg && "V_SET_INACTIVE must be in known WWM region");
- // Note: default here is set to ExecReg so that functional MIR is still
- // generated if implicit def is not found and assertions are disabled.
- if (!ExecSrcReg)
- ExecSrcReg = ExecReg;
-
- // Ideally in WWM this operation is lowered to V_CNDMASK; however,
- // constant bus constraints and the presence of literal constants
- // present an issue.
- // Fallback to V_MOV base lowering in all but the common cases.
- const bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
- MachineFunction *MF = MBB.getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
- const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
- const MCInstrDesc &Desc = get(Opcode);
-
- const APInt ActiveImm(64, ActiveSrc.isImm() ? ActiveSrc.getImm() : 0);
- const APInt InactiveImm(64, InactiveSrc.isImm() ? InactiveSrc.getImm() : 0);
- const APInt ActiveImmLo(32, ActiveImm.getLoBits(32).getZExtValue());
- const APInt ActiveImmHi(32, ActiveImm.getHiBits(32).getZExtValue());
- const APInt InactiveImmLo(32, InactiveImm.getLoBits(32).getZExtValue());
- const APInt InactiveImmHi(32, InactiveImm.getHiBits(32).getZExtValue());
-
- int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
- int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
-
- int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
- int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
- int ConstantBusUses =
- 1 + // Starts at 1 for ExecSrcReg
- (usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0) +
- (usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0);
- int LiteralConstants =
- ((ActiveSrc.isReg() ||
- (ActiveSrc.isImm() && isInlineConstant(ActiveImm)))
- ? 0
- : 1) +
- ((InactiveSrc.isReg() ||
- (InactiveSrc.isImm() && isInlineConstant(InactiveImm)))
- ? 0
- : 1);
-
- bool UseVCndMask =
- ConstantBusUses <= ConstantBusLimit && LiteralConstants <= LiteralLimit;
- if (VMov64 && UseVCndMask) {
- // Decomposition must not introduce new literals.
- UseVCndMask &=
- ActiveSrc.isReg() ||
- (isInlineConstant(ActiveImmLo) && isInlineConstant(ActiveImmHi)) ||
- (!isInlineConstant(ActiveImm));
- UseVCndMask &= InactiveSrc.isReg() ||
- (isInlineConstant(InactiveImmLo) &&
- isInlineConstant(InactiveImmHi)) ||
- (!isInlineConstant(InactiveImm));
- }
-
- if (UseVCndMask && VMov64) {
- // Dual V_CNDMASK_B32
- MachineOperand ActiveLo = buildExtractSubRegOrImm(
- MI, MRI, ActiveSrc, nullptr, AMDGPU::sub0, nullptr);
- MachineOperand ActiveHi = buildExtractSubRegOrImm(
- MI, MRI, ActiveSrc, nullptr, AMDGPU::sub1, nullptr);
- MachineOperand InactiveLo = buildExtractSubRegOrImm(
- MI, MRI, InactiveSrc, nullptr, AMDGPU::sub0, nullptr);
- MachineOperand InactiveHi = buildExtractSubRegOrImm(
- MI, MRI, InactiveSrc, nullptr, AMDGPU::sub1, nullptr);
- if (ActiveSrc.isReg())
- ActiveHi.setIsKill(ActiveSrc.isKill());
- if (InactiveSrc.isReg())
- InactiveHi.setIsKill(InactiveSrc.isKill());
- BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub0))
- .addImm(0)
- .add(InactiveLo)
- .addImm(0)
- .add(ActiveLo)
- .addReg(ExecSrcReg)
- .addReg(DstReg, RegState::ImplicitDefine);
- BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub1))
- .addImm(0)
- .add(InactiveHi)
- .addImm(0)
- .add(ActiveHi)
- .addReg(ExecSrcReg)
- .addReg(DstReg, RegState::ImplicitDefine);
- } else if (UseVCndMask) {
- // Single V_CNDMASK_B32
- BuildMI(MBB, MI, DL, Desc, DstReg)
- .addImm(0)
- .add(InactiveSrc)
- .addImm(0)
- .add(ActiveSrc)
- .addReg(ExecSrcReg);
- } else {
- // Fallback V_MOV case.
- // Avoid unnecessary work if a source VGPR is also the destination.
- // This can happen if WWM register allocation was efficient.
- // Note: this assumes WWM execution.
- bool DstIsActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
- bool DstIsInactive =
- InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
- if (!DstIsInactive) {
- // Set exec mask to inactive lanes,
- // but only if active lanes would be overwritten.
- if (DstIsActive) {
- BuildMI(MBB, MI, DL, get(NotOpc), ExecReg)
- .addReg(ExecSrcReg)
- .setOperandDead(3); // Dead scc
- }
- // Copy inactive lanes
- MachineInstr *VMov =
- BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc);
- if (VMov64)
- expandPostRAPseudo(*VMov);
- }
- if (!DstIsActive) {
- // Set exec mask to active lanes
- BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg);
- // Copy active lanes
- MachineInstr *VMov =
- BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg())
- .add(ActiveSrc);
- if (VMov64)
- expandPostRAPseudo(*VMov);
- }
- // Restore WWM
- BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1);
- }
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(5));
MI.eraseFromParent();
break;
}
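With the lanes mask now an explicit operand (operand 5) and source modifiers in operands 1 and 3, the pseudo expands to a single conditional move; the operand mapping, as inferred from the BuildMI above:

    //   V_SET_INACTIVE_B32 dst, mods0, active, mods1, inactive, mask
    //     -> V_CNDMASK_B32_e64 dst, mods1, inactive, mods0, active, mask
    // (src0 supplies lanes whose mask bit is clear, i.e. the inactive lanes;
    //  src1 supplies the active lanes.)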
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 7143251..4fd9b43 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1437,8 +1437,6 @@ public:
// This is used if an operand is a 32 bit register but needs to be aligned
// regardless.
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const;
-
- static Register findSetInactiveMask(const MachineInstr &MI);
};
/// \brief Returns true if a reg:subreg pair P has a TRC class
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index f9f0796..284be72 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -243,29 +243,16 @@ def : GCNPat <(f32 (fptrunc_round f64:$src0, (i32 SupportedRoundMode:$round))),
// Invert the exec mask and overwrite the inactive lanes of dst with inactive,
// restoring it after we're done.
-let Defs = [SCC], isConvergent = 1 in {
-def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
- (ins VSrc_b32: $src, VSrc_b32:$inactive), []>;
-
-def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
- (ins VSrc_b64: $src, VSrc_b64:$inactive), []>;
-} // End Defs = [SCC]
+let isConvergent = 1 in
+def V_SET_INACTIVE_B32 : VOP3_Pseudo<"v_set_inactive_b32", VOP2e_I32_I32_I32_I1>;
foreach vt = Reg32Types.types in {
def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
- (V_SET_INACTIVE_B32 VSrc_b32:$src, VSrc_b32:$inactive)>;
-}
-
-foreach vt = Reg64Types.types in {
-def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
- (V_SET_INACTIVE_B64 VSrc_b64:$src, VSrc_b64:$inactive)>;
+ (V_SET_INACTIVE_B32 0, VSrc_b32:$src, 0, VSrc_b32:$inactive, (IMPLICIT_DEF))>;
}
def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
- (V_SET_INACTIVE_B32 VGPR_32:$src, VGPR_32:$inactive)>;
-
-def : GCNPat<(i64 (int_amdgcn_set_inactive_chain_arg i64:$src, i64:$inactive)),
- (V_SET_INACTIVE_B64 VReg_64:$src, VReg_64:$inactive)>;
+ (V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index 29fef49..3bf2ea0 100644
--- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -215,8 +215,7 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock *MBB : RPOT) {
bool InWWM = false;
for (MachineInstr &MI : *MBB) {
- if (MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
- MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64)
+ if (MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32)
RegsAssigned |= processDef(MI.getOperand(0));
if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR) {
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 38ebda6..8cedc34 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -557,26 +557,18 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
// This avoids unnecessarily marking M0 as requiring WQM.
III.Needs |= StateStrictWQM;
GlobalFlags |= StateStrictWQM;
- } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
- Opcode == AMDGPU::V_SET_INACTIVE_B64) {
- // Ignore these if V_SET_INACTIVE which already has exec src register.
- // These are generated by an earlier pass which has seperately ensured
- // WWM and provided a mask of inactive lanes.
- Register ExecSrc = TII->findSetInactiveMask(MI);
- if (!ExecSrc) {
- // Disable strict states; StrictWQM will be added as required later.
- III.Disabled = StateStrict;
- MachineOperand &Inactive = MI.getOperand(2);
- if (Inactive.isReg()) {
- if (Inactive.isUndef()) {
- LowerToCopyInstrs.insert(&MI);
- } else {
- markOperand(MI, Inactive, StateStrictWWM, Worklist);
- }
- }
- SetInactiveInstrs.push_back(&MI);
- BBI.NeedsLowering = true;
+ } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32) {
+ // Disable strict states; StrictWQM will be added as required later.
+ III.Disabled = StateStrict;
+ MachineOperand &Inactive = MI.getOperand(4);
+ if (Inactive.isReg()) {
+ if (Inactive.isUndef() && MI.getOperand(3).getImm() == 0)
+ LowerToCopyInstrs.insert(&MI);
+ else
+ markOperand(MI, Inactive, StateStrictWWM, Worklist);
}
+ SetInactiveInstrs.push_back(&MI);
+ BBI.NeedsLowering = true;
} else if (TII->isDisableWQM(MI)) {
BBI.Needs |= StateExact;
if (!(BBI.InNeeds & StateExact)) {
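The new condition above is compact; spelled out as a predicate (names illustrative, a sketch rather than the pass code):

```cpp
// V_SET_INACTIVE_B32 degenerates to a plain copy of the active source only
// when the inactive operand (operand 4) is an undef register and its
// source-modifier immediate (operand 3) is zero.
static bool LowersToCopy(bool InactiveIsReg, bool InactiveIsUndef,
                         long long InactiveModsImm) {
  return InactiveIsReg && InactiveIsUndef && InactiveModsImm == 0;
}
```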
@@ -1078,10 +1070,11 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
ActiveLanesReg = 0;
break;
case AMDGPU::V_SET_INACTIVE_B32:
- case AMDGPU::V_SET_INACTIVE_B64:
if (ActiveLanesReg) {
- MI.addOperand(*MBB.getParent(),
- MachineOperand::CreateReg(ActiveLanesReg, false, true));
+ LiveInterval &LI = LIS->getInterval(MI.getOperand(5).getReg());
+ MRI->constrainRegClass(ActiveLanesReg, TRI->getWaveMaskRegClass());
+ MI.getOperand(5).setReg(ActiveLanesReg);
+ LIS->shrinkToUses(&LI);
} else {
assert(State == StateExact || State == StateWQM);
}
@@ -1527,15 +1520,17 @@ bool SIWholeQuadMode::lowerCopyInstrs() {
for (MachineInstr *MI : LowerToCopyInstrs) {
LLVM_DEBUG(dbgs() << "simplify: " << *MI);
- if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
- MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
- assert(MI->getNumExplicitOperands() == 3);
+ if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32) {
+ assert(MI->getNumExplicitOperands() == 6);
LiveInterval *RecomputeLI = nullptr;
- if (MI->getOperand(2).isReg())
- RecomputeLI = &LIS->getInterval(MI->getOperand(2).getReg());
+ if (MI->getOperand(4).isReg())
+ RecomputeLI = &LIS->getInterval(MI->getOperand(4).getReg());
- MI->removeOperand(2);
+ MI->removeOperand(5);
+ MI->removeOperand(4);
+ MI->removeOperand(3);
+ MI->removeOperand(1);
if (RecomputeLI)
LIS->shrinkToUses(RecomputeLI);
@@ -1547,12 +1542,6 @@ bool SIWholeQuadMode::lowerCopyInstrs() {
? (unsigned)AMDGPU::COPY
: TII->getMovOpcode(TRI->getRegClassForOperandReg(
*MRI, MI->getOperand(0)));
- int Index = MI->findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
- while (Index >= 0) {
- MI->removeOperand(Index);
- Index = MI->findRegisterUseOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
- }
-
MI->setDesc(TII->get(CopyOp));
LLVM_DEBUG(dbgs() << " -> " << *MI);
}
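Note the order of the removeOperand calls in the hunk above. Each removal shifts every later operand down by one, so the indices must be stripped from highest to lowest; a small standalone model:

```cpp
#include <cstdio>
#include <vector>

// Dropping operands 5, 4, 3 and 1 in descending order leaves exactly the
// {vdst, src0} pair that the subsequent COPY rewrite needs.
int main() {
  std::vector<const char *> Ops = {"vdst",      "src0_mods", "src0",
                                   "src1_mods", "src1",      "mask"};
  for (int Idx : {5, 4, 3, 1})
    Ops.erase(Ops.begin() + Idx); // ascending order would misindex here
  for (const char *Op : Ops)
    std::printf("%s ", Op); // prints: vdst src0
  std::printf("\n");
}
```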
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 1205ad4..082546c4 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -139,16 +139,21 @@
#include "NVPTX.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXUtilities.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/PtrUseVisitor.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include <numeric>
#include <queue>
@@ -217,7 +222,8 @@ INITIALIZE_PASS_END(NVPTXLowerArgs, "nvptx-lower-args",
// pointer in parameter AS.
// For "escapes" (to memory, a function call, or a ptrtoint), cast the OldUse to
// generic using cvta.param.
-static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
+static void convertToParamAS(Use *OldUse, Value *Param, bool HasCvtaParam,
+ bool IsGridConstant) {
Instruction *I = dyn_cast<Instruction>(OldUse->getUser());
assert(I && "OldUse must be in an instruction");
struct IP {
@@ -228,7 +234,8 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
SmallVector<IP> ItemsToConvert = {{OldUse, I, Param}};
SmallVector<Instruction *> InstructionsToDelete;
- auto CloneInstInParamAS = [GridConstant](const IP &I) -> Value * {
+ auto CloneInstInParamAS = [HasCvtaParam,
+ IsGridConstant](const IP &I) -> Value * {
if (auto *LI = dyn_cast<LoadInst>(I.OldInstruction)) {
LI->setOperand(0, I.NewParam);
return LI;
@@ -252,8 +259,25 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
// Just pass through the argument, the old ASC is no longer needed.
return I.NewParam;
}
+ if (auto *MI = dyn_cast<MemTransferInst>(I.OldInstruction)) {
+ if (MI->getRawSource() == I.OldUse->get()) {
+ // Convert to memcpy/memmove from param space.
+ IRBuilder<> Builder(I.OldInstruction);
+ Intrinsic::ID ID = MI->getIntrinsicID();
+
+ CallInst *B = Builder.CreateMemTransferInst(
+ ID, MI->getRawDest(), MI->getDestAlign(), I.NewParam,
+ MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
+ for (unsigned I : {0, 1})
+ if (uint64_t Bytes = MI->getParamDereferenceableBytes(I))
+ B->addDereferenceableParamAttr(I, Bytes);
+ return B;
+ }
+ // We may be able to handle other cases if the argument is
+ // __grid_constant__
+ }
- if (GridConstant) {
+ if (HasCvtaParam) {
auto GetParamAddrCastToGeneric =
[](Value *Addr, Instruction *OriginalUser) -> Value * {
PointerType *ReturnTy =
@@ -269,24 +293,44 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
OriginalUser->getIterator());
return CvtToGenCall;
};
-
- if (auto *CI = dyn_cast<CallInst>(I.OldInstruction)) {
- I.OldUse->set(GetParamAddrCastToGeneric(I.NewParam, CI));
- return CI;
+ auto *ParamInGenericAS =
+ GetParamAddrCastToGeneric(I.NewParam, I.OldInstruction);
+
+ // phi/select could use generic arg pointers w/o __grid_constant__
+ if (auto *PHI = dyn_cast<PHINode>(I.OldInstruction)) {
+ for (auto [Idx, V] : enumerate(PHI->incoming_values())) {
+ if (V.get() == I.OldUse->get())
+ PHI->setIncomingValue(Idx, ParamInGenericAS);
+ }
}
- if (auto *SI = dyn_cast<StoreInst>(I.OldInstruction)) {
- // byval address is being stored, cast it to generic
- if (SI->getValueOperand() == I.OldUse->get())
- SI->setOperand(0, GetParamAddrCastToGeneric(I.NewParam, SI));
- return SI;
+ if (auto *SI = dyn_cast<SelectInst>(I.OldInstruction)) {
+ if (SI->getTrueValue() == I.OldUse->get())
+ SI->setTrueValue(ParamInGenericAS);
+ if (SI->getFalseValue() == I.OldUse->get())
+ SI->setFalseValue(ParamInGenericAS);
}
- if (auto *PI = dyn_cast<PtrToIntInst>(I.OldInstruction)) {
- if (PI->getPointerOperand() == I.OldUse->get())
- PI->setOperand(0, GetParamAddrCastToGeneric(I.NewParam, PI));
- return PI;
+
+ // Escapes or writes can only use generic param pointers if
+ // __grid_constant__ is in effect.
+ if (IsGridConstant) {
+ if (auto *CI = dyn_cast<CallInst>(I.OldInstruction)) {
+ I.OldUse->set(ParamInGenericAS);
+ return CI;
+ }
+ if (auto *SI = dyn_cast<StoreInst>(I.OldInstruction)) {
+ // byval address is being stored, cast it to generic
+ if (SI->getValueOperand() == I.OldUse->get())
+ SI->setOperand(0, ParamInGenericAS);
+ return SI;
+ }
+ if (auto *PI = dyn_cast<PtrToIntInst>(I.OldInstruction)) {
+ if (PI->getPointerOperand() == I.OldUse->get())
+ PI->setOperand(0, ParamInGenericAS);
+ return PI;
+ }
+ // TODO: If we allow stores, we should allow memcpy/memset to the
+ // parameter, too.
}
- llvm_unreachable(
- "Instruction unsupported even for grid_constant argument");
}
llvm_unreachable("Unsupported instruction");
@@ -409,49 +453,110 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
}
}
+namespace {
+struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> {
+ using Base = PtrUseVisitor<ArgUseChecker>;
+
+ bool IsGridConstant;
+ // Set of phi/select instructions using the Arg
+ SmallPtrSet<Instruction *, 4> Conditionals;
+
+ ArgUseChecker(const DataLayout &DL, bool IsGridConstant)
+ : PtrUseVisitor(DL), IsGridConstant(IsGridConstant) {}
+
+ PtrInfo visitArgPtr(Argument &A) {
+ assert(A.getType()->isPointerTy());
+ IntegerType *IntIdxTy = cast<IntegerType>(DL.getIndexType(A.getType()));
+ IsOffsetKnown = false;
+ Offset = APInt(IntIdxTy->getBitWidth(), 0);
+ PI.reset();
+ Conditionals.clear();
+
+ LLVM_DEBUG(dbgs() << "Checking Argument " << A << "\n");
+ // Enqueue the uses of this pointer.
+ enqueueUsers(A);
+
+ // Visit all the uses off the worklist until it is empty.
+ // Note that unlike PtrUseVisitor we intentionally do not track offsets.
+ // We're only interested in how we use the pointer.
+ while (!(Worklist.empty() || PI.isAborted())) {
+ UseToVisit ToVisit = Worklist.pop_back_val();
+ U = ToVisit.UseAndIsOffsetKnown.getPointer();
+ Instruction *I = cast<Instruction>(U->getUser());
+ if (isa<PHINode>(I) || isa<SelectInst>(I))
+ Conditionals.insert(I);
+ LLVM_DEBUG(dbgs() << "Processing " << *I << "\n");
+ Base::visit(I);
+ }
+ if (PI.isEscaped())
+ LLVM_DEBUG(dbgs() << "Argument pointer escaped: " << *PI.getEscapingInst()
+ << "\n");
+ else if (PI.isAborted())
+ LLVM_DEBUG(dbgs() << "Pointer use needs a copy: " << *PI.getAbortingInst()
+ << "\n");
+ LLVM_DEBUG(dbgs() << "Traversed " << Conditionals.size()
+ << " conditionals\n");
+ return PI;
+ }
+
+ void visitStoreInst(StoreInst &SI) {
+ // Storing the pointer escapes it.
+ if (U->get() == SI.getValueOperand())
+ return PI.setEscapedAndAborted(&SI);
+ // Writes to the pointer are UB w/ __grid_constant__, but do not force a
+ // copy.
+ if (!IsGridConstant)
+ return PI.setAborted(&SI);
+ }
+
+ void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
+ // ASCs to param space are no-ops and do not need a copy.
+ if (ASC.getDestAddressSpace() != ADDRESS_SPACE_PARAM)
+ return PI.setEscapedAndAborted(&ASC);
+ Base::visitAddrSpaceCastInst(ASC);
+ }
+
+ void visitPtrToIntInst(PtrToIntInst &I) {
+ if (IsGridConstant)
+ return;
+ Base::visitPtrToIntInst(I);
+ }
+ void visitPHINodeOrSelectInst(Instruction &I) {
+ assert(isa<PHINode>(I) || isa<SelectInst>(I));
+ }
+ // PHI and select just pass through the pointers.
+ void visitPHINode(PHINode &PN) { enqueueUsers(PN); }
+ void visitSelectInst(SelectInst &SI) { enqueueUsers(SI); }
+
+ void visitMemTransferInst(MemTransferInst &II) {
+ if (*U == II.getRawDest() && !IsGridConstant)
+ PI.setAborted(&II);
+ // memcpy/memmove are OK when the pointer is the source. We can convert them to
+ // AS-specific memcpy.
+ }
+
+ void visitMemSetInst(MemSetInst &II) {
+ if (!IsGridConstant)
+ PI.setAborted(&II);
+ }
+}; // struct ArgUseChecker
+} // namespace
+
void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
Argument *Arg) {
- bool IsGridConstant = isParamGridConstant(*Arg);
Function *Func = Arg->getParent();
+ bool HasCvtaParam = TM.getSubtargetImpl(*Func)->hasCvtaParam();
+ bool IsGridConstant = HasCvtaParam && isParamGridConstant(*Arg);
+ const DataLayout &DL = Func->getDataLayout();
BasicBlock::iterator FirstInst = Func->getEntryBlock().begin();
Type *StructType = Arg->getParamByValType();
assert(StructType && "Missing byval type");
- auto AreSupportedUsers = [&](Value *Start) {
- SmallVector<Value *, 16> ValuesToCheck = {Start};
- auto IsSupportedUse = [IsGridConstant](Value *V) -> bool {
- if (isa<GetElementPtrInst>(V) || isa<BitCastInst>(V) || isa<LoadInst>(V))
- return true;
- // ASC to param space are OK, too -- we'll just strip them.
- if (auto *ASC = dyn_cast<AddrSpaceCastInst>(V)) {
- if (ASC->getDestAddressSpace() == ADDRESS_SPACE_PARAM)
- return true;
- }
- // Simple calls and stores are supported for grid_constants
- // writes to these pointers are undefined behaviour
- if (IsGridConstant &&
- (isa<CallInst>(V) || isa<StoreInst>(V) || isa<PtrToIntInst>(V)))
- return true;
- return false;
- };
-
- while (!ValuesToCheck.empty()) {
- Value *V = ValuesToCheck.pop_back_val();
- if (!IsSupportedUse(V)) {
- LLVM_DEBUG(dbgs() << "Need a "
- << (isParamGridConstant(*Arg) ? "cast " : "copy ")
- << "of " << *Arg << " because of " << *V << "\n");
- (void)Arg;
- return false;
- }
- if (!isa<LoadInst>(V) && !isa<CallInst>(V) && !isa<StoreInst>(V) &&
- !isa<PtrToIntInst>(V))
- llvm::append_range(ValuesToCheck, V->users());
- }
- return true;
- };
-
- if (llvm::all_of(Arg->users(), AreSupportedUsers)) {
+ ArgUseChecker AUC(DL, IsGridConstant);
+ ArgUseChecker::PtrInfo PI = AUC.visitArgPtr(*Arg);
+ bool ArgUseIsReadOnly = !(PI.isEscaped() || PI.isAborted());
+ // Easy case, accessing parameter directly is fine.
+ if (ArgUseIsReadOnly && AUC.Conditionals.empty()) {
// Convert all loads and intermediate operations to use parameter AS and
// skip creation of a local copy of the argument.
SmallVector<Use *, 16> UsesToUpdate;
@@ -462,7 +567,7 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
FirstInst);
for (Use *U : UsesToUpdate)
- convertToParamAS(U, ArgInParamAS, IsGridConstant);
+ convertToParamAS(U, ArgInParamAS, HasCvtaParam, IsGridConstant);
LLVM_DEBUG(dbgs() << "No need to copy or cast " << *Arg << "\n");
const auto *TLI =
@@ -473,13 +578,17 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
return;
}
- const DataLayout &DL = Func->getDataLayout();
+ // We can't access the byval arg directly and need a pointer. On sm_70+ we
+ // have the ability to take a pointer to the argument without making a local
+ // copy. However, we're still not allowed to write to it. If the user
+ // specified `__grid_constant__` for the argument, we'll treat the escaped
+ // pointer as read-only.
unsigned AS = DL.getAllocaAddrSpace();
- if (isParamGridConstant(*Arg)) {
- // Writes to a grid constant are undefined behaviour. We do not need a
- // temporary copy. When a pointer might have escaped, conservatively replace
- // all of its uses (which might include a device function call) with a cast
- // to the generic address space.
+ if (HasCvtaParam && (ArgUseIsReadOnly || IsGridConstant)) {
+ LLVM_DEBUG(dbgs() << "Using non-copy pointer to " << *Arg << "\n");
+ // Replace all argument pointer uses (which might include a device function
+ // call) with a cast to the generic address space using cvta.param
+ // instruction, which avoids a local copy.
IRBuilder<> IRB(&Func->getEntryBlock().front());
// Cast argument to param address space
@@ -500,6 +609,7 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
// Do not replace Arg in the cast to param space
CastToParam->setOperand(0, Arg);
} else {
+ LLVM_DEBUG(dbgs() << "Creating a local copy of " << *Arg << "\n");
// Otherwise we have to create a temporary copy.
AllocaInst *AllocA =
new AllocaInst(StructType, AS, Arg->getName(), FirstInst);
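The control flow of handleByValParam now reduces to a three-way choice; a sketch with the inputs assumed precomputed as in the hunk (names illustrative):

```cpp
enum class ByValLowering { DirectParamAS, CvtaParamCast, LocalCopy };

// ReadOnly is "neither escaped nor aborted" from ArgUseChecker;
// HasConditionals reflects phi/select uses of the argument pointer.
static ByValLowering Pick(bool ReadOnly, bool HasConditionals,
                          bool HasCvtaParam, bool IsGridConstant) {
  if (ReadOnly && !HasConditionals)
    return ByValLowering::DirectParamAS; // rewrite uses into param AS
  if (HasCvtaParam && (ReadOnly || IsGridConstant))
    return ByValLowering::CvtaParamCast; // cvta.param pointer, no local copy
  return ByValLowering::LocalCopy;       // alloca plus copy at entry
}
```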
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 0591782..457f10f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -94,6 +94,7 @@ public:
bool hasDotInstructions() const {
return SmVersion >= 61 && PTXVersion >= 50;
}
+ bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; }
unsigned int getFullSmVersion() const { return FullSmVersion; }
unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
// GPUs with "a" suffix have include architecture-accelerated features that
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index d2e6257..4554163 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1106,18 +1106,18 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
if (!isTypeLegal(VT))
continue;
setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom);
- setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom);
setOperationAction({ISD::STRICT_FP_ROUND, ISD::STRICT_FP_EXTEND}, VT,
Custom);
+ setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom);
+ setOperationAction({ISD::VP_MERGE, ISD::VP_SELECT, ISD::SELECT}, VT,
+ Custom);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
setOperationAction({ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR,
ISD::EXTRACT_SUBVECTOR},
VT, Custom);
- setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom);
if (Subtarget.hasStdExtZfbfmin())
setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
- setOperationAction({ISD::VP_MERGE, ISD::VP_SELECT, ISD::SELECT}, VT,
- Custom);
- setOperationAction(ISD::SELECT_CC, VT, Expand);
+ setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom);
// TODO: Promote to fp32.
}
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 13212c2..02f65ff 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -2918,7 +2918,7 @@ RISCVInstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
// if any possible.
if (MO.getTargetFlags() == RISCVII::MO_PCREL_LO &&
(MI.getMF()->getTarget().getFunctionSections() || F.hasComdat() ||
- F.hasSection()))
+ F.hasSection() || F.getSectionPrefix()))
return outliner::InstrType::Illegal;
}
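The one-token change above widens the bail-out condition; as a predicate (a sketch, with the section-prefix case covering, for example, hot/unlikely prefixes derived from profile data):

```cpp
// Outlining an MO_PCREL_LO user is illegal whenever the function may be
// placed in a non-default text section, now including section prefixes,
// not only explicit sections, comdats, or -ffunction-sections.
static bool BlocksPcrelLoOutlining(bool FunctionSections, bool HasComdat,
                                   bool HasSection, bool HasSectionPrefix) {
  return FunctionSections || HasComdat || HasSection || HasSectionPrefix;
}
```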
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 5cc084f..1875a8f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -96,6 +96,10 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
setOperationAction(ISD::STORE, T, Custom);
}
}
+ if (Subtarget->hasFP16()) {
+ setOperationAction(ISD::LOAD, MVT::v8f16, Custom);
+ setOperationAction(ISD::STORE, MVT::v8f16, Custom);
+ }
if (Subtarget->hasReferenceTypes()) {
// We need custom load and store lowering for both externref, funcref and
// Other. The MVT::Other here represents tables of reference types.
@@ -208,6 +212,9 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
MVT::v2f64})
setOperationAction(ISD::BUILD_VECTOR, T, Custom);
+ if (Subtarget->hasFP16())
+ setOperationAction(ISD::BUILD_VECTOR, MVT::f16, Custom);
+
// We have custom shuffle lowering to expose the shuffle mask
for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32, MVT::v2i64,
MVT::v2f64})
@@ -2055,6 +2062,18 @@ static SDValue LowerConvertLow(SDValue Op, SelectionDAG &DAG) {
SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
+ MVT VT = Op.getSimpleValueType();
+ if (VT == MVT::v8f16) {
+ // BUILD_VECTOR can't handle FP16 operands since Wasm doesn't have a scalar
+ // FP16 type, so cast them to I16s.
+ MVT IVT = VT.changeVectorElementType(MVT::i16);
+ SmallVector<SDValue, 8> NewOps;
+ for (unsigned I = 0, E = Op.getNumOperands(); I < E; ++I)
+ NewOps.push_back(DAG.getBitcast(MVT::i16, Op.getOperand(I)));
+ SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
+ return DAG.getBitcast(VT, Res);
+ }
+
if (auto ConvertLow = LowerConvertLow(Op, DAG))
return ConvertLow;
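The bitcast trick above never touches lane payloads; it only relabels types. A scalar-sized analogue in standard C++ (using float/uint32_t, since portable C++ lacks a scalar f16 much as Wasm does):

```cpp
#include <array>
#include <bit>
#include <cstdint>
#include <cstdio>

// Reinterpret each FP lane as a same-width integer, build the integer
// vector, then bitcast the whole result back; values round-trip exactly.
int main() {
  std::array<float, 4> Lanes = {1.0f, -2.5f, 0.0f, 3.75f};
  std::array<uint32_t, 4> IntLanes{};
  for (unsigned I = 0; I < Lanes.size(); ++I)
    IntLanes[I] = std::bit_cast<uint32_t>(Lanes[I]); // per-lane bitcast
  auto Back = std::bit_cast<std::array<float, 4>>(IntLanes); // vector bitcast
  std::printf("%g %g %g %g\n", Back[0], Back[1], Back[2], Back[3]);
}
```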
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 9d17d90..9be23da 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -189,7 +189,7 @@ defm LOAD_V128_A64 :
}
// Def load patterns from WebAssemblyInstrMemory.td for vector types
-foreach vec = StdVecs in {
+foreach vec = AllVecs in {
defm : LoadPat<vec.vt, load, "LOAD_V128">;
}
@@ -390,7 +390,7 @@ defm STORE_V128_A64 :
}
// Def store patterns from WebAssemblyInstrMemory.td for vector types
-foreach vec = StdVecs in {
+foreach vec = AllVecs in {
defm : StorePat<vec.vt, store, "STORE_V128">;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b821da0..3b6b154 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1182,13 +1182,23 @@ public:
InstructionCost Cost) {
assert(VF.isVector() && "Expected VF >=2");
/// Broadcast this decision to all instructions inside the group.
- /// But the cost will be assigned to one instruction only.
+ /// When interleaving, the cost will only be assigned to one instruction, the
+ /// insert position. For other cases, add the appropriate fraction of the
+ /// total cost to each instruction. This ensures accurate costs are used,
+ /// even if the insert position instruction is not used.
+ InstructionCost InsertPosCost = Cost;
+ InstructionCost OtherMemberCost = 0;
+ if (W != CM_Interleave)
+ OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
if (auto *I = Grp->getMember(Idx)) {
if (Grp->getInsertPos() == I)
- WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
+ WideningDecisions[std::make_pair(I, VF)] =
+ std::make_pair(W, InsertPosCost);
else
- WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
+ WideningDecisions[std::make_pair(I, VF)] =
+ std::make_pair(W, OtherMemberCost);
}
}
}
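The arithmetic matches the cost-model test updates later in this patch: a stride-5 f32 interleave group at VF 2 with a total cost of 15 used to record 15 on the insert position and 0 on the other four members, and now records 15 / 5 = 3 on every member. A minimal model:

```cpp
#include <cstdio>

int main() {
  const int TotalCost = 15, NumMembers = 5; // stride-5 f32 group at VF 2
  const bool IsInterleave = false;          // CM_Interleave keeps old split
  int InsertPosCost = IsInterleave ? TotalCost : TotalCost / NumMembers;
  int OtherMemberCost = IsInterleave ? 0 : TotalCost / NumMembers;
  std::printf("insert-pos=%d, others=%d\n", InsertPosCost, OtherMemberCost);
}
```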
@@ -8663,6 +8673,7 @@ static MapVector<PHINode *, VPValue *> collectUsersInExitBlock(
!cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
isa<VPWidenPointerInductionRecipe>(V) ||
(isa<Instruction>(IncomingValue) &&
+ OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
any_of(IncomingValue->users(), [&Inductions](User *U) {
auto *P = dyn_cast<PHINode>(U);
return P && Inductions.contains(P);
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp
index e9be6f5..c5d2ebf 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp
@@ -28,24 +28,24 @@ PreservedAnalyses SandboxVectorizerPass::run(Function &F,
return PA;
}
-bool SandboxVectorizerPass::runImpl(Function &F) {
+bool SandboxVectorizerPass::runImpl(Function &LLVMF) {
// If the target claims to have no vector registers early return.
if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
LLVM_DEBUG(dbgs() << "SBVec: Target has no vector registers, return.\n");
return false;
}
- LLVM_DEBUG(dbgs() << "SBVec: Analyzing " << F.getName() << ".\n");
+ LLVM_DEBUG(dbgs() << "SBVec: Analyzing " << LLVMF.getName() << ".\n");
// Early return if the attribute NoImplicitFloat is used.
- if (F.hasFnAttribute(Attribute::NoImplicitFloat)) {
+ if (LLVMF.hasFnAttribute(Attribute::NoImplicitFloat)) {
LLVM_DEBUG(dbgs() << "SBVec: NoImplicitFloat attribute, return.\n");
return false;
}
- sandboxir::Context Ctx(F.getContext());
- // Create SandboxIR for `F`.
- sandboxir::Function &SBF = *Ctx.createFunction(&F);
+ sandboxir::Context Ctx(LLVMF.getContext());
+ // Create SandboxIR for `LLVMF`.
+ sandboxir::Function &F = *Ctx.createFunction(&LLVMF);
// TODO: Initialize SBVec Pass Manager
- (void)SBF;
+ (void)F;
return false;
}
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll
index 29dce5f..57ae02a 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll
@@ -82,26 +82,26 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 75 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 150 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
@@ -139,11 +139,11 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX512: LV: Found an estimated cost of 400 for VF 64 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v4 = load float, ptr %in4, align 4
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll
index 0e7b1c5..2cc5150 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll
@@ -108,34 +108,34 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4
-; AVX2: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4
-; AVX2: LV: Found an estimated cost of 49 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4
-; AVX2: LV: Found an estimated cost of 105 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4
-; AVX2: LV: Found an estimated cost of 210 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
@@ -180,13 +180,13 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4
-; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load float, ptr %in5, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load float, ptr %in6, align 4
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll
index 8830aff..1899741 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll
@@ -121,22 +121,22 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load float, ptr %in7, align 4
-; AVX2: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load float, ptr %in7, align 4
-; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load float, ptr %in7, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v7 = load float, ptr %in7, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v7 = load float, ptr %in7, align 4
; AVX2: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4
; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4
@@ -145,14 +145,14 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4
; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4
; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load float, ptr %in7, align 4
-; AVX2: LV: Found an estimated cost of 240 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load float, ptr %in7, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v7 = load float, ptr %in7, align 4
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
@@ -203,14 +203,14 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load float, ptr %in7, align 4
-; AVX512: LV: Found an estimated cost of 320 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load float, ptr %in5, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load float, ptr %in6, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load float, ptr %in7, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v7 = load float, ptr %in7, align 4
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll
index cfd3d78..2d4b300 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll
@@ -71,10 +71,10 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
; AVX512: LV: Found an estimated cost of 22 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 80 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 160 for VF 64 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load double, ptr %in1, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll
index 5ec5b51..5dfb25e 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll
@@ -88,12 +88,12 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 120 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 240 for VF 64 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v2 = load double, ptr %in2, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll
index 450743d..bd88ca8 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll
@@ -107,14 +107,14 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 80 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 160 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll
index 5e5c718..9c079863 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll
@@ -72,21 +72,21 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 70 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
@@ -114,16 +114,16 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 100 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 200 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll
index 62541fa2..99a735d 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll
@@ -133,18 +133,18 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 120 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 240 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load double, ptr %in5, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll
index cfed855..168e916 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll
@@ -94,27 +94,27 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
-; AVX2: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
-; AVX2: LV: Found an estimated cost of 49 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
-; AVX2: LV: Found an estimated cost of 98 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
@@ -152,20 +152,20 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
-; AVX512: LV: Found an estimated cost of 140 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load double, ptr %in6, align 8
-; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load double, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load double, ptr %in6, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll
index 07939b9..919a17e 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll
@@ -105,30 +105,30 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load double, ptr %in7, align 8
-; AVX2: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load double, ptr %in7, align 8
-; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load double, ptr %in7, align 8
-; AVX2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load double, ptr %in7, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v7 = load double, ptr %in7, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v7 = load double, ptr %in7, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v7 = load double, ptr %in7, align 8
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
@@ -163,22 +163,22 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load double, ptr %in7, align 8
-; AVX512: LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load double, ptr %in7, align 8
-; AVX512: LV: Found an estimated cost of 160 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load double, ptr %in6, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load double, ptr %in7, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v7 = load double, ptr %in7, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v7 = load double, ptr %in7, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
index 964a9b6..6737c72 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
@@ -93,31 +93,31 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 165 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 330 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
;
; AVX512DQ-LABEL: 'test'
; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll
index 6653198..46d56a7 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll
@@ -123,41 +123,41 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX2: LV: Found an estimated cost of 231 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX2: LV: Found an estimated cost of 462 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2
;
; AVX512DQ-LABEL: 'test'
; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll
index b3a5cbe..4d65abd 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll
@@ -138,46 +138,46 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2
-; AVX2: LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i16, ptr %in7, align 2
-; AVX2: LV: Found an estimated cost of 64 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i16, ptr %in7, align 2
-; AVX2: LV: Found an estimated cost of 128 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i16, ptr %in7, align 2
-; AVX2: LV: Found an estimated cost of 264 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i16, ptr %in7, align 2
-; AVX2: LV: Found an estimated cost of 528 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i16, ptr %in7, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v7 = load i16, ptr %in7, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v7 = load i16, ptr %in7, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v7 = load i16, ptr %in7, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v7 = load i16, ptr %in7, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v7 = load i16, ptr %in7, align 2
;
; AVX512DQ-LABEL: 'test'
; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
index c0ea210..28a6443 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
@@ -73,8 +73,8 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
; AVX512: LV: Found an estimated cost of 50 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX512: LV: Found an estimated cost of 160 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load i32, ptr %in1, align 4
;
entry:
br label %for.body
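
The indices-01uu hunk above shows the same split when only part of the group is demanded: the VF 64 total of 160, formerly attributed to %v0 alone, now divides over the two loaded indices, 160 / 2 = 80 per load, so the group total is again preserved (a hedged reading, assuming the totals are meant to be unchanged).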
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll
index 2a261ca..5cad7bf 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll
@@ -82,26 +82,26 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 85 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 170 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
@@ -139,11 +139,11 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX512: LV: Found an estimated cost of 400 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v4 = load i32, ptr %in4, align 4
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll
index 8bf3071..cfb83d4 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll
@@ -108,34 +108,34 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
-; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4
-; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4
-; AVX2: LV: Found an estimated cost of 119 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4
-; AVX2: LV: Found an estimated cost of 238 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
@@ -180,13 +180,13 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
-; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i32, ptr %in5, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load i32, ptr %in6, align 4
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll
index 3182de2..7757854 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll
@@ -121,22 +121,22 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i32, ptr %in7, align 4
-; AVX2: LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i32, ptr %in7, align 4
-; AVX2: LV: Found an estimated cost of 64 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i32, ptr %in7, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v7 = load i32, ptr %in7, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v7 = load i32, ptr %in7, align 4
; AVX2: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
@@ -145,14 +145,14 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4
; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4
; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i32, ptr %in7, align 4
-; AVX2: LV: Found an estimated cost of 272 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i32, ptr %in7, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v7 = load i32, ptr %in7, align 4
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
@@ -203,14 +203,14 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i32, ptr %in7, align 4
-; AVX512: LV: Found an estimated cost of 320 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i32, ptr %in5, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i32, ptr %in6, align 4
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i32, ptr %in7, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v7 = load i32, ptr %in7, align 4
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll
index 27e2ee0..cf350cc 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll
@@ -71,10 +71,10 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
; AVX512: LV: Found an estimated cost of 22 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 80 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 160 for VF 64 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load i64, ptr %in1, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll
index c377232..9ca0d8c 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll
@@ -88,12 +88,12 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 120 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 240 for VF 64 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v2 = load i64, ptr %in2, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll
index 2eb7c5e..86ee6c8 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll
@@ -107,14 +107,14 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 80 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 160 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll
index c11da43..f6143d4 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll
@@ -72,21 +72,21 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 45 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 90 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
@@ -114,16 +114,16 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 100 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 200 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll
index de57af6..43dc53d 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll
@@ -133,18 +133,18 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 120 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 240 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load i64, ptr %in5, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll
index 949c1af..70ed74dc 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll
@@ -94,27 +94,27 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX2: LV: Found an estimated cost of 63 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX2: LV: Found an estimated cost of 126 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
@@ -152,20 +152,20 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX512: LV: Found an estimated cost of 140 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load i64, ptr %in6, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll
index 4388ccf..401e4de 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll
@@ -105,30 +105,30 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i64, ptr %in7, align 8
-; AVX2: LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i64, ptr %in7, align 8
-; AVX2: LV: Found an estimated cost of 72 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i64, ptr %in7, align 8
-; AVX2: LV: Found an estimated cost of 144 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i64, ptr %in7, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v7 = load i64, ptr %in7, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v7 = load i64, ptr %in7, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v7 = load i64, ptr %in7, align 8
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
@@ -163,22 +163,22 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i64, ptr %in7, align 8
-; AVX512: LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i64, ptr %in7, align 8
-; AVX512: LV: Found an estimated cost of 160 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i64, ptr %in6, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i64, ptr %in7, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v7 = load i64, ptr %in7, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v7 = load i64, ptr %in7, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll
index 6078fb4..ef3c80c 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll
@@ -93,31 +93,31 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 160 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 325 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
;
; AVX512DQ-LABEL: 'test'
; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll
index 778a4e7..8e7c316 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll
@@ -123,41 +123,41 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 224 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 455 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1
;
; AVX512DQ-LABEL: 'test'
; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
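
Note on the stride-7 load hunk above: the rewritten expectations redistribute the interleave-group cost evenly across all seven members instead of charging it entirely to %v0 (with zeros on %v1..%v6). As an arithmetic check derived purely from the values in the hunk, the new per-member costs multiply back to the old lump sums:

  VF 2:  7 x 4  = 28
  VF 4:  7 x 8  = 56
  VF 8:  7 x 16 = 112
  VF 16: 7 x 32 = 224
  VF 32: 7 x 65 = 455
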
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll
index a230b5a..752cc22 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll
@@ -138,46 +138,46 @@ define void @test() {
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
-; AVX2: LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i8, ptr %in7, align 1
-; AVX2: LV: Found an estimated cost of 64 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i8, ptr %in7, align 1
-; AVX2: LV: Found an estimated cost of 128 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i8, ptr %in7, align 1
-; AVX2: LV: Found an estimated cost of 256 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i8, ptr %in7, align 1
-; AVX2: LV: Found an estimated cost of 520 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1
-; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i8, ptr %in7, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v7 = load i8, ptr %in7, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v7 = load i8, ptr %in7, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v7 = load i8, ptr %in7, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v7 = load i8, ptr %in7, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v7 = load i8, ptr %in7, align 1
;
; AVX512DQ-LABEL: 'test'
; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
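
The stride-8 load expectations above follow the same even split, this time across eight members:

  VF 2:  8 x 4  = 32
  VF 4:  8 x 8  = 64
  VF 8:  8 x 16 = 128
  VF 16: 8 x 32 = 256
  VF 32: 8 x 65 = 520
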
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll
index c1a66c1..ed2bb3f 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll
@@ -163,22 +163,22 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v5, ptr %out5, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v6, ptr %out6, align 8
; AVX512: LV: Found an estimated cost of 46 for VF 4 For instruction: store double %v7, ptr %out7, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v0, ptr %out0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v1, ptr %out1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v2, ptr %out2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v3, ptr %out3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v4, ptr %out4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v5, ptr %out5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v6, ptr %out6, align 8
-; AVX512: LV: Found an estimated cost of 80 for VF 8 For instruction: store double %v7, ptr %out7, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v0, ptr %out0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v1, ptr %out1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v2, ptr %out2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v3, ptr %out3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v4, ptr %out4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v5, ptr %out5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v6, ptr %out6, align 8
-; AVX512: LV: Found an estimated cost of 160 for VF 16 For instruction: store double %v7, ptr %out7, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v0, ptr %out0, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v1, ptr %out1, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v2, ptr %out2, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v3, ptr %out3, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v4, ptr %out4, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v5, ptr %out5, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v6, ptr %out6, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v7, ptr %out7, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v0, ptr %out0, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v1, ptr %out1, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v2, ptr %out2, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v3, ptr %out3, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v4, ptr %out4, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v5, ptr %out5, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v6, ptr %out6, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v7, ptr %out7, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll
index 7be9577..a539877 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll
@@ -163,22 +163,22 @@ define void @test() {
; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v5, ptr %out5, align 8
; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v6, ptr %out6, align 8
; AVX512: LV: Found an estimated cost of 46 for VF 4 For instruction: store i64 %v7, ptr %out7, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v0, ptr %out0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v1, ptr %out1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v2, ptr %out2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v3, ptr %out3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v4, ptr %out4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v5, ptr %out5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v6, ptr %out6, align 8
-; AVX512: LV: Found an estimated cost of 80 for VF 8 For instruction: store i64 %v7, ptr %out7, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v0, ptr %out0, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v1, ptr %out1, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v2, ptr %out2, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v3, ptr %out3, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v4, ptr %out4, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v5, ptr %out5, align 8
-; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v6, ptr %out6, align 8
-; AVX512: LV: Found an estimated cost of 160 for VF 16 For instruction: store i64 %v7, ptr %out7, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v0, ptr %out0, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v1, ptr %out1, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v2, ptr %out2, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v3, ptr %out3, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v4, ptr %out4, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v5, ptr %out5, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v6, ptr %out6, align 8
+; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v7, ptr %out7, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v0, ptr %out0, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v1, ptr %out1, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v2, ptr %out2, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v3, ptr %out3, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v4, ptr %out4, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v5, ptr %out5, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v6, ptr %out6, align 8
+; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v7, ptr %out7, align 8
;
entry:
br label %for.body
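
Likewise for the AVX512 stride-8 store groups in the two files above (the f64 and i64 variants carry identical values): the updated VF 8 and VF 16 rows spread the cost that was previously attached only to the final member %v7:

  VF 8:  8 x 10 = 80
  VF 16: 8 x 20 = 160

The VF 4 rows are unchanged and still charge 46 to %v7 alone.
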
diff --git a/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll b/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll
index 13a8442..41dd266 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll
@@ -38,8 +38,8 @@ define void @test1(ptr noalias nocapture %points, ptr noalias nocapture readonly
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 12 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 6 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 6 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 14 for VF 4 For instruction: store i16 %2, ptr %arrayidx7, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 8 For instruction: store i16 %0, ptr %arrayidx2, align 2
@@ -99,8 +99,8 @@ define void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr noalias no
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2
-; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 10 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 5 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 5 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 14 for VF 4 For instruction: store i16 %2, ptr %arrayidx7, align 2
; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 8 For instruction: store i16 %0, ptr %arrayidx2, align 2
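
The masked-interleaved VF 2 updates split each pair the same way: test1's 12 becomes 6 + 6 and test2's 10 becomes 5 + 5 across the two stores, while the VF 4 and VF 8 rows keep the old single-member attribution.
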
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
index 9c634ab..89c3bbc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
@@ -85,7 +85,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX908-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
; GFX908-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX908-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX908-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec
+ ; GFX908-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GFX908-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF]], implicit $exec
; GFX908-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX908-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX908-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
@@ -159,7 +160,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
; GFX90A-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec
+ ; GFX90A-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF]], implicit $exec
; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX90A-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX90A-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
@@ -233,7 +235,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
; GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec
+ ; GFX940-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF]], implicit $exec
; GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
@@ -300,7 +303,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX11-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[COPY5]], implicit $exec
; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY6]], implicit-def dead $scc, implicit $exec
+ ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY6]], [[DEF1]], implicit $exec
; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
; GFX11-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY7]], [[V_SET_INACTIVE_B32_]], 353, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
index fdce9d9..8eef3d4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
@@ -77,7 +77,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX90A-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
; GFX90A-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec
+ ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF1]], implicit $exec
; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX90A-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX90A-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
@@ -170,7 +171,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
; GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
- ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec
+ ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY11]], [[DEF1]], implicit $exec
; GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
@@ -255,7 +257,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[COPY5]], implicit $exec
; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
- ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY6]], implicit-def dead $scc, implicit $exec
+ ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY2]], 0, [[COPY6]], [[DEF1]], implicit $exec
; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
; GFX11-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY7]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index 137366a..f015099 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -4,15 +4,13 @@
define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: set_inactive:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, 42
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 42, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
@@ -45,17 +43,15 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) {
; GCN-LABEL: set_inactive_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v3, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v2, v0
-; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
@@ -89,22 +85,18 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
; GCN-LABEL: set_inactive_scc:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
+; GCN-NEXT: s_load_dword s8, s[2:3], 0x2c
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_buffer_load_dword s6, s[4:7], 0x0
-; GCN-NEXT: s_load_dword s7, s[2:3], 0x2c
+; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT: s_mov_b32 s2, 1
+; GCN-NEXT: v_mov_b32_e32 v1, s8
+; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 42, v1, s[2:3]
+; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s6, 56
-; GCN-NEXT: s_cselect_b32 s3, 1, 0
-; GCN-NEXT: v_mov_b32_e32 v0, 42
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: v_mov_b32_e32 v0, s7
-; GCN-NEXT: s_mov_b64 exec, -1
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_cmp_lg_u32 s4, 56
; GCN-NEXT: v_mov_b32_e32 v1, v0
-; GCN-NEXT: s_cmp_lg_u32 s3, 0
+; GCN-NEXT: s_mov_b32 s2, 1
; GCN-NEXT: s_cbranch_scc0 .LBB4_2
; GCN-NEXT: ; %bb.1: ; %.one
; GCN-NEXT: v_add_u32_e32 v2, vcc, 1, v1
@@ -145,15 +137,16 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
; GCN-LABEL: set_inactive_f32:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
; GCN-NEXT: v_mov_b32_e32 v0, 0x40400000
+; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
@@ -174,13 +167,15 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
; GCN-NEXT: v_mov_b32_e32 v1, 0x4010cccc
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3]
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[2:3]
+; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: v_mov_b32_e32 v2, v0
-; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
; GCN-NEXT: s_endpgm
@@ -193,15 +188,16 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
; GCN-LABEL: set_inactive_v2i16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
; GCN-NEXT: v_mov_b32_e32 v0, 0x10001
+; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
@@ -216,15 +212,16 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %
define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
; GCN-LABEL: set_inactive_v2f16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
; GCN-NEXT: v_mov_b32_e32 v0, 0x3c003c00
+; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
@@ -240,19 +237,15 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %
; GCN-LABEL: set_inactive_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT: s_mov_b32 s6, 1
-; GCN-NEXT: s_mov_b32 s7, s6
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: v_mov_b32_e32 v1, s7
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 1, v2, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v1, 1, v3, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v2, v0
-; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
@@ -267,19 +260,15 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
; GCN-LABEL: set_inactive_v2f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT: s_mov_b32 s6, 1.0
-; GCN-NEXT: s_mov_b32 s7, s6
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: v_mov_b32_e32 v1, s7
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 1.0, v2, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v1, 1.0, v3, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v2, v0
-; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
@@ -293,15 +282,16 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
; GCN-LABEL: set_inactive_v2bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80
+; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
@@ -318,21 +308,20 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT: s_mov_b32 s6, 0x10001
-; GCN-NEXT: s_mov_b32 s7, s6
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NEXT: v_mov_b32_e32 v0, 0x10001
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: v_mov_b32_e32 v2, v0
+; GCN-NEXT: v_mov_b32_e32 v3, s2
+; GCN-NEXT: v_mov_b32_e32 v4, s3
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v3, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v4, s[4:5]
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: v_mov_b32_e32 v4, v2
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[3:4], off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp.0 = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0
%tmp = call <4 x i16> @llvm.amdgcn.strict.wwm.v4i16(<4 x i16> %tmp.0)
@@ -345,21 +334,20 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half>
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT: s_mov_b32 s6, 0x3c003c00
-; GCN-NEXT: s_mov_b32 s7, s6
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NEXT: v_mov_b32_e32 v0, 0x3c003c00
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: v_mov_b32_e32 v2, v0
+; GCN-NEXT: v_mov_b32_e32 v3, s2
+; GCN-NEXT: v_mov_b32_e32 v4, s3
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v3, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v4, s[4:5]
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: v_mov_b32_e32 v4, v2
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[3:4], off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp.0 = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0
%tmp = call <4 x half> @llvm.amdgcn.strict.wwm.v4f16(<4 x half> %tmp.0)
@@ -372,21 +360,20 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT: s_mov_b32 s6, 0x3f803f80
-; GCN-NEXT: s_mov_b32 s7, s6
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
-; GCN-NEXT: v_mov_b32_e32 v2, v0
+; GCN-NEXT: v_mov_b32_e32 v3, s2
+; GCN-NEXT: v_mov_b32_e32 v4, s3
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v3, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v4, s[4:5]
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: v_mov_b32_e32 v4, v2
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[3:4], off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp.0 = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0
%tmp = call <4 x bfloat> @llvm.amdgcn.strict.wwm.v4bf16(<4 x bfloat> %tmp.0)
@@ -398,17 +385,15 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
; GCN-LABEL: set_inactive_p0:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[4:5]
+; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v3, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v2, v0
-; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
@@ -422,15 +407,13 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
; GCN-LABEL: set_inactive_p2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
@@ -445,15 +428,13 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
; GCN-LABEL: set_inactive_p3:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
@@ -468,15 +449,13 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
; GCN-LABEL: set_inactive_p5:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
@@ -491,15 +470,13 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
; GCN-LABEL: set_inactive_p6:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s3, 0xf000
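
The llvm.amdgcn.set.inactive changes above show the new lowering strategy: rather than flipping exec to -1 and back around plain v_mov writes, the compiler now materializes the inactive value with v_cndmask_b32_e64 under the lane mask saved by s_or_saveexec_b64, matching the new MIR form of V_SET_INACTIVE_B32 (source modifiers plus a trailing exec-mask operand fed by IMPLICIT_DEF) in the global-atomic-fadd tests earlier. Below is a minimal standalone sketch in the style of these tests; it assumes the scalar i32 overloads of the intrinsics (only the vector overloads are quoted verbatim in the hunks above), and something like llc -mtriple=amdgcn -mcpu=gfx900 should exercise the same path:

declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)
declare i32 @llvm.amdgcn.strict.wwm.i32(i32)

define amdgpu_kernel void @set_inactive_example(ptr addrspace(1) %out, i32 %in) {
  ; Active lanes keep %in; lanes inactive on entry observe 42 instead.
  %masked = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42)
  ; strict.wwm returns the whole-wave value to normally exec-masked code.
  %wwm = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %masked)
  store i32 %wwm, ptr addrspace(1) %out
  ret void
}
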
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 5a8df7b8..b17dfc7 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -2890,68 +2890,65 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_DPP-LABEL: add_i64_varying:
; GFX8_DPP: ; %bb.0: ; %entry
; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[4:5]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[4:5]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[4:5]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[4:5]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; GFX8_DPP-NEXT: v_readlane_b32 s7, v4, 63
-; GFX8_DPP-NEXT: v_readlane_b32 s6, v3, 63
-; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; GFX8_DPP-NEXT: v_readlane_b32 s7, v1, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s6, v4, 63
; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8_DPP-NEXT: s_cbranch_execz .LBB5_2
@@ -2970,8 +2967,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8
; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v3
; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4
; GFX8_DPP-NEXT: v_add_u32_e32 v7, vcc, s5, v7
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2984,68 +2981,65 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_DPP-LABEL: add_i64_varying:
; GFX9_DPP: ; %bb.0: ; %entry
; GFX9_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
-; GFX9_DPP-NEXT: v_readlane_b32 s3, v4, 63
-; GFX9_DPP-NEXT: v_readlane_b32 s2, v3, 63
-; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
+; GFX9_DPP-NEXT: v_readlane_b32 s3, v1, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s2, v4, 63
; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9_DPP-NEXT: s_cbranch_execz .LBB5_2
@@ -3064,8 +3058,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v8
; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v3
; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0
; GFX9_DPP-NEXT: v_add_co_u32_e32 v7, vcc, s1, v7
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -3077,23 +3071,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1064_DPP-LABEL: add_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v4
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v5, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
@@ -3184,23 +3176,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1032_DPP-LABEL: add_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s0
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s0
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v4
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
@@ -3275,31 +3265,30 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0
-; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0
+; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -3388,53 +3377,53 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
+; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
; GFX1132_DPP-NEXT: v_readlane_b32 s4, v4, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s7, v4, 15
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s8, v3, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s5, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16
@@ -3478,31 +3467,30 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, 0
-; GFX1264_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0
+; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1]
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1]
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
-; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
-; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -3594,29 +3582,30 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
+; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0
-; GFX1232_DPP-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v6, v2
+; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s0
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0
-; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
-; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -6611,68 +6600,65 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_DPP-LABEL: sub_i64_varying:
; GFX8_DPP: ; %bb.0: ; %entry
; GFX8_DPP-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[4:5]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[4:5]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[4:5]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[4:5]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; GFX8_DPP-NEXT: v_readlane_b32 s7, v4, 63
-; GFX8_DPP-NEXT: v_readlane_b32 s6, v3, 63
-; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; GFX8_DPP-NEXT: v_readlane_b32 s7, v1, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s6, v4, 63
; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX8_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX8_DPP-NEXT: s_cbranch_execz .LBB11_2
@@ -6691,8 +6677,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8
; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v3
; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4
; GFX8_DPP-NEXT: v_sub_u32_e32 v7, vcc, s5, v7
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -6705,68 +6691,65 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_DPP-LABEL: sub_i64_varying:
; GFX9_DPP: ; %bb.0: ; %entry
; GFX9_DPP-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
-; GFX9_DPP-NEXT: v_readlane_b32 s3, v4, 63
-; GFX9_DPP-NEXT: v_readlane_b32 s2, v3, 63
-; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
+; GFX9_DPP-NEXT: v_readlane_b32 s3, v1, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s2, v4, 63
; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9_DPP-NEXT: s_cbranch_execz .LBB11_2
@@ -6785,8 +6768,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v8
; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v3
; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0
; GFX9_DPP-NEXT: v_sub_co_u32_e32 v7, vcc, s1, v7
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
@@ -6798,23 +6781,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1064_DPP-LABEL: sub_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v4
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v5, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
@@ -6905,23 +6886,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1032_DPP-LABEL: sub_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s0
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s0
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v4
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
@@ -6996,31 +6975,30 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0
-; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0
+; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -7109,53 +7087,53 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
+; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
; GFX1132_DPP-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
; GFX1132_DPP-NEXT: v_readlane_b32 s4, v4, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s7, v4, 15
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s8, v3, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s5, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s6
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s6, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16
@@ -7199,31 +7177,30 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1264_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1264_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, 0
-; GFX1264_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0
+; GFX1264_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1264_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1]
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1]
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1264_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
-; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1264_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1264_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
-; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1264_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1264_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1264_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -7315,29 +7292,30 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1232_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1232_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
+; GFX1232_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1232_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s0
-; GFX1232_DPP-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v6, v2
+; GFX1232_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s0
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1232_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s0
-; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1232_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
-; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1232_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1232_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1232_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1232_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1232_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
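
The hunks above and below are the kind of FileCheck expectations typically regenerated with llvm/utils/update_llc_test_checks.py; reading the deltas, the changes are VGPR renumbering, instruction rescheduling, and updated s_delay_alu hints, while the reduction itself is unchanged. For orientation, a minimal sketch of the shape of IR test these checks cover follows; the exact bodies live in the .ll files being diffed, and @local_var64 plus the use of workitem.id.x as the varying input are illustrative assumptions, not copied from this patch:

; Sketch (assumed names): each lane feeds a varying i64 into one LDS
; atomic, which the atomic-optimizer rewrite turns into the DPP wave
; scan plus the single ds_add_rtn_u64 visible in the checks above.
@local_var64 = internal addrspace(3) global i64 poison, align 8

declare i32 @llvm.amdgcn.workitem.id.x()

define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
entry:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %val = zext i32 %lane to i64
  %old = atomicrmw add ptr addrspace(3) @local_var64, i64 %val acq_rel
  store i64 %old, ptr addrspace(1) %out
  ret void
}
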
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 6bf03a2..988bc8e 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -2361,84 +2361,82 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
;
; GFX8_DPP-LABEL: add_i64_varying:
; GFX8_DPP: ; %bb.0: ; %entry
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; GFX8_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v4, 63
; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8_DPP-NEXT: s_cbranch_execz .LBB6_2
; GFX8_DPP-NEXT: ; %bb.1:
-; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4
; GFX8_DPP-NEXT: s_mov_b32 m0, -1
-; GFX8_DPP-NEXT: ds_add_rtn_u64 v[9:10], v8, v[9:10]
+; GFX8_DPP-NEXT: ds_add_rtn_u64 v[7:8], v9, v[7:8]
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_DPP-NEXT: .LBB6_2:
; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v10
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v9
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v3
; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4
; GFX8_DPP-NEXT: v_add_u32_e32 v7, vcc, s5, v7
; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
@@ -2450,83 +2448,81 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
;
; GFX9_DPP-LABEL: add_i64_varying:
; GFX9_DPP: ; %bb.0: ; %entry
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
-; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
+; GFX9_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v4, 63
; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9_DPP-NEXT: s_cbranch_execz .LBB6_2
; GFX9_DPP-NEXT: ; %bb.1:
-; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4
-; GFX9_DPP-NEXT: ds_add_rtn_u64 v[9:10], v8, v[9:10]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX9_DPP-NEXT: ds_add_rtn_u64 v[7:8], v9, v[7:8]
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_DPP-NEXT: .LBB6_2:
; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v10
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v9
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v3
; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4
; GFX9_DPP-NEXT: v_add_co_u32_e32 v7, vcc, s5, v7
; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
@@ -2538,23 +2534,21 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: add_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v4
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v5, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
@@ -2605,57 +2599,56 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32
; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX1064_DPP-NEXT: v_writelane_b32 v2, s9, 48
; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 48
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7]
-; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
; GFX1064_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064_DPP-NEXT: s_cbranch_execz .LBB6_2
; GFX1064_DPP-NEXT: ; %bb.1:
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1064_DPP-NEXT: ds_add_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1064_DPP-NEXT: ds_add_rtn_u64 v[9:10], v0, v[9:10]
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_gl0_inv
; GFX1064_DPP-NEXT: .LBB6_2:
; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v10
; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1064_DPP-NEXT: v_add_co_u32 v9, vcc, s0, v9
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc, s1, v10, vcc
+; GFX1064_DPP-NEXT: v_add_co_u32 v9, vcc, s0, v11
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc, s1, v12, vcc
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0
; GFX1064_DPP-NEXT: s_endpgm
;
; GFX1032_DPP-LABEL: add_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s4
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v4
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
@@ -2685,33 +2678,34 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31
; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v1, s5, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB6_2
; GFX1032_DPP-NEXT: ; %bb.1:
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1032_DPP-NEXT: ds_add_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1032_DPP-NEXT: ds_add_rtn_u64 v[9:10], v0, v[9:10]
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_gl0_inv
; GFX1032_DPP-NEXT: .LBB6_2:
; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10
; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1032_DPP-NEXT: v_add_co_u32 v9, vcc_lo, s0, v9
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s1, v10, vcc_lo
+; GFX1032_DPP-NEXT: v_add_co_u32 v9, vcc_lo, s0, v11
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, s1, v12, vcc_lo
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0
; GFX1032_DPP-NEXT: s_endpgm
@@ -2722,31 +2716,30 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0
-; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0
+; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -2790,34 +2783,35 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 32
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 48
; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
; GFX1164_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11
-; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1164_DPP-NEXT: s_cbranch_execz .LBB6_2
; GFX1164_DPP-NEXT: ; %bb.1:
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s0
-; GFX1164_DPP-NEXT: ds_add_rtn_u64 v[10:11], v9, v[10:11]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s0
+; GFX1164_DPP-NEXT: ds_add_rtn_u64 v[8:9], v0, v[8:9]
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_gl0_inv
; GFX1164_DPP-NEXT: .LBB6_2:
; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v2
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v11
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, v2
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v9
; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_add_co_u32 v8, vcc, s0, v8
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc, s1, v9, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32 v8, vcc, s0, v10
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc, s1, v11, vcc
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0
; GFX1164_DPP-NEXT: s_nop 0
@@ -2830,77 +2824,77 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
+; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s4
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s4
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s4
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s4
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_readlane_b32 s0, v4, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11
-; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB6_2
; GFX1132_DPP-NEXT: ; %bb.1:
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
-; GFX1132_DPP-NEXT: ds_add_rtn_u64 v[10:11], v9, v[10:11]
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0
+; GFX1132_DPP-NEXT: ds_add_rtn_u64 v[8:9], v0, v[8:9]
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_gl0_inv
; GFX1132_DPP-NEXT: .LBB6_2:
; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v2
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v11
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v2
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9
; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s0, v8
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s1, v9, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32 v8, vcc_lo, s0, v10
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, s1, v11, vcc_lo
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0
; GFX1132_DPP-NEXT: s_nop 0
@@ -3163,164 +3157,158 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
;
; GFX8_DPP-LABEL: add_i64_varying_nouse:
; GFX8_DPP: ; %bb.0: ; %entry
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v8
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_add_u32_e32 v1, vcc, v3, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc
-; GFX8_DPP-NEXT: v_readlane_b32 s3, v2, 63
-; GFX8_DPP-NEXT: v_readlane_b32 s2, v1, 63
+; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_add_u32_e32 v2, vcc, v4, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX8_DPP-NEXT: v_readlane_b32 s3, v1, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s2, v2, 63
; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX8_DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
-; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
; GFX8_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX8_DPP-NEXT: s_cbranch_execz .LBB7_2
; GFX8_DPP-NEXT: ; %bb.1:
-; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s0
; GFX8_DPP-NEXT: s_mov_b32 m0, -1
-; GFX8_DPP-NEXT: ds_add_u64 v8, v[9:10]
+; GFX8_DPP-NEXT: ds_add_u64 v7, v[8:9]
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_DPP-NEXT: .LBB7_2:
; GFX8_DPP-NEXT: s_endpgm
;
; GFX9_DPP-LABEL: add_i64_varying_nouse:
; GFX9_DPP: ; %bb.0: ; %entry
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v8
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
-; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
-; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v1, vcc, v3, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v2, vcc
-; GFX9_DPP-NEXT: v_readlane_b32 s3, v2, 63
-; GFX9_DPP-NEXT: v_readlane_b32 s2, v1, 63
+; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9_DPP-NEXT: v_readlane_b32 s3, v1, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s2, v2, 63
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9_DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
-; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
; GFX9_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9_DPP-NEXT: s_cbranch_execz .LBB7_2
; GFX9_DPP-NEXT: ; %bb.1:
-; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s0
-; GFX9_DPP-NEXT: ds_add_u64 v8, v[9:10]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s0
+; GFX9_DPP-NEXT: ds_add_u64 v7, v[8:9]
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_DPP-NEXT: .LBB7_2:
; GFX9_DPP-NEXT: s_endpgm
;
; GFX1064_DPP-LABEL: add_i64_varying_nouse:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v4
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v5, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
@@ -3345,16 +3333,17 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 32
; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 32
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1064_DPP-NEXT: s_add_u32 s0, s3, s4
; GFX1064_DPP-NEXT: s_addc_u32 s1, s2, s5
-; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064_DPP-NEXT: s_cbranch_execz .LBB7_2
; GFX1064_DPP-NEXT: ; %bb.1:
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1064_DPP-NEXT: ds_add_u64 v10, v[11:12]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1064_DPP-NEXT: ds_add_u64 v0, v[9:10]
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_gl0_inv
; GFX1064_DPP-NEXT: .LBB7_2:
@@ -3362,25 +3351,21 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
;
; GFX1032_DPP-LABEL: add_i64_varying_nouse:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s0
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v4
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
@@ -3398,14 +3383,15 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v2
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v11, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v11
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB7_2
; GFX1032_DPP-NEXT: ; %bb.1:
-; GFX1032_DPP-NEXT: ds_add_u64 v10, v[11:12]
+; GFX1032_DPP-NEXT: ds_add_u64 v0, v[9:10]
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_gl0_inv
; GFX1032_DPP-NEXT: .LBB7_2:
@@ -3417,30 +3403,29 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, 0
-; GFX1164_DPP-NEXT: v_and_b32_e32 v6, 0x3ff, v0
+; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v6, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, v2
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v7, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v4, v1, vcc
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v5, vcc
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v4, vcc
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v3, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc, v4, v4 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc, v1, v2, vcc
@@ -3460,15 +3445,16 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1164_DPP-NEXT: v_add_co_u32 v2, vcc, v2, v3
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v1, v4, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v3
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
; GFX1164_DPP-NEXT: s_cbranch_execz .LBB7_2
; GFX1164_DPP-NEXT: ; %bb.1:
-; GFX1164_DPP-NEXT: ds_add_u64 v7, v[8:9]
+; GFX1164_DPP-NEXT: ds_add_u64 v0, v[6:7]
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_gl0_inv
; GFX1164_DPP-NEXT: .LBB7_2:
@@ -3480,49 +3466,49 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_and_b32 v6, 0x3ff, v0
+; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v6, s0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v7, s0
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v4, v4 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v3, vcc_lo, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v2, v3, 0, 0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v1, 0, 0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_add_co_u32 v2, vcc_lo, v3, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v4, vcc_lo
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v7, v3
; GFX1132_DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB7_2
; GFX1132_DPP-NEXT: ; %bb.1:
-; GFX1132_DPP-NEXT: ds_add_u64 v7, v[8:9]
+; GFX1132_DPP-NEXT: ds_add_u64 v0, v[6:7]
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_gl0_inv
; GFX1132_DPP-NEXT: .LBB7_2:
@@ -5909,84 +5895,82 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
;
; GFX8_DPP-LABEL: sub_i64_varying:
; GFX8_DPP: ; %bb.0: ; %entry
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX8_DPP-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; GFX8_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v4, 63
; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8_DPP-NEXT: s_cbranch_execz .LBB14_2
; GFX8_DPP-NEXT: ; %bb.1:
-; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4
; GFX8_DPP-NEXT: s_mov_b32 m0, -1
-; GFX8_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v8, v[9:10]
+; GFX8_DPP-NEXT: ds_sub_rtn_u64 v[7:8], v9, v[7:8]
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_DPP-NEXT: .LBB14_2:
; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v10
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v9
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v3
; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4
; GFX8_DPP-NEXT: v_sub_u32_e32 v7, vcc, s5, v7
; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
@@ -5998,83 +5982,81 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
;
; GFX9_DPP-LABEL: sub_i64_varying:
; GFX9_DPP: ; %bb.0: ; %entry
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v3
+; GFX9_DPP-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
-; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc
+; GFX9_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v4, 63
; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9_DPP-NEXT: s_cbranch_execz .LBB14_2
; GFX9_DPP-NEXT: ; %bb.1:
-; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4
-; GFX9_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v8, v[9:10]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX9_DPP-NEXT: ds_sub_rtn_u64 v[7:8], v9, v[7:8]
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_DPP-NEXT: .LBB14_2:
; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v10
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v9
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v2
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v3
; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4
; GFX9_DPP-NEXT: v_sub_co_u32_e32 v7, vcc, s5, v7
; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
@@ -6086,23 +6068,21 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: sub_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v5
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v4, v6, vcc
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v4
+; GFX1064_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc, v5, v6, vcc
; GFX1064_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_add_co_u32 v3, vcc, v3, v7
@@ -6153,57 +6133,56 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32
; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX1064_DPP-NEXT: v_writelane_b32 v2, s9, 48
; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 48
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7]
-; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
; GFX1064_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064_DPP-NEXT: s_cbranch_execz .LBB14_2
; GFX1064_DPP-NEXT: ; %bb.1:
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1064_DPP-NEXT: ds_sub_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1064_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v0, v[9:10]
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_gl0_inv
; GFX1064_DPP-NEXT: .LBB14_2:
; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v10
; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1064_DPP-NEXT: v_sub_co_u32 v9, vcc, s0, v9
-; GFX1064_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc, s1, v10, vcc
+; GFX1064_DPP-NEXT: v_sub_co_u32 v9, vcc, s0, v11
+; GFX1064_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc, s1, v12, vcc
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0
; GFX1064_DPP-NEXT: s_endpgm
;
; GFX1032_DPP-LABEL: sub_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s4
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v4, v1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v5, 0, 0, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v5
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v4
+; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v6, vcc_lo
; GFX1032_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_add_co_u32 v3, vcc_lo, v3, v7
@@ -6233,33 +6212,34 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_readlane_b32 s1, v4, 31
; GFX1032_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v2, s6, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v1, s5, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB14_2
; GFX1032_DPP-NEXT: ; %bb.1:
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1032_DPP-NEXT: ds_sub_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1032_DPP-NEXT: ds_sub_rtn_u64 v[9:10], v0, v[9:10]
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_gl0_inv
; GFX1032_DPP-NEXT: .LBB14_2:
; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, v1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, v2
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10
; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1032_DPP-NEXT: v_sub_co_u32 v9, vcc_lo, s0, v9
-; GFX1032_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, s1, v10, vcc_lo
+; GFX1032_DPP-NEXT: v_sub_co_u32 v9, vcc_lo, s0, v11
+; GFX1032_DPP-NEXT: v_sub_co_ci_u32_e32 v10, vcc_lo, s1, v12, vcc_lo
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[9:10], off, s[4:7], 0
; GFX1032_DPP-NEXT: s_endpgm
@@ -6270,31 +6250,30 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, 0
-; GFX1164_DPP-NEXT: v_and_b32_e32 v8, 0x3ff, v0
+; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s[0:1]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v5, v3, vcc
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v6, vcc
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v5, vcc
-; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc
+; GFX1164_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
@@ -6338,34 +6317,35 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_writelane_b32 v2, s7, 32
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 48
; GFX1164_DPP-NEXT: v_writelane_b32 v2, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
; GFX1164_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11
-; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1164_DPP-NEXT: s_cbranch_execz .LBB14_2
; GFX1164_DPP-NEXT: ; %bb.1:
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s0
-; GFX1164_DPP-NEXT: ds_sub_rtn_u64 v[10:11], v9, v[10:11]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s0
+; GFX1164_DPP-NEXT: ds_sub_rtn_u64 v[8:9], v0, v[8:9]
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_gl0_inv
; GFX1164_DPP-NEXT: .LBB14_2:
; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v2
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v11
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, v2
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v9
; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_sub_co_u32 v8, vcc, s0, v8
-; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc, s1, v9, vcc
+; GFX1164_DPP-NEXT: v_sub_co_u32 v8, vcc, s0, v10
+; GFX1164_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc, s1, v11, vcc
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0
; GFX1164_DPP-NEXT: s_nop 0
@@ -6378,77 +6358,77 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v8, 0x3ff, v0
+; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s4
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v3, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s4
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, 0, s4
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s4
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v4, v2
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v5, vcc_lo, v5, v5 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
+; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v5, v5 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
; GFX1132_DPP-NEXT: v_add_co_u32_e64_dpp v4, vcc_lo, v7, v4 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v6, v3, -1, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX1132_DPP-NEXT: v_readlane_b32 s0, v4, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v6 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v2, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT: ; implicit-def: $vgpr10_vgpr11
-; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB14_2
; GFX1132_DPP-NEXT: ; %bb.1:
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
-; GFX1132_DPP-NEXT: ds_sub_rtn_u64 v[10:11], v9, v[10:11]
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v9, s1 :: v_dual_mov_b32 v8, s0
+; GFX1132_DPP-NEXT: ds_sub_rtn_u64 v[8:9], v0, v[8:9]
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_gl0_inv
; GFX1132_DPP-NEXT: .LBB14_2:
; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v1
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v2
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v11
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v1
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v11, v2
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9
; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s0, v8
-; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s1, v9, vcc_lo
+; GFX1132_DPP-NEXT: v_sub_co_u32 v8, vcc_lo, s0, v10
+; GFX1132_DPP-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, s1, v11, vcc_lo
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0
; GFX1132_DPP-NEXT: s_nop 0
@@ -7432,129 +7412,121 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
;
; GFX8_DPP-LABEL: and_i64_varying:
; GFX8_DPP: ; %bb.0: ; %entry
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v5, s[0:1]
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s[0:1]
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX8_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: s_mov_b32 s6, -1
+; GFX8_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1
+; GFX8_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63
+; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6
; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8_DPP-NEXT: s_cbranch_execz .LBB16_2
; GFX8_DPP-NEXT: ; %bb.1:
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s4
; GFX8_DPP-NEXT: s_mov_b32 m0, -1
-; GFX8_DPP-NEXT: ds_and_rtn_b64 v[7:8], v6, v[7:8]
+; GFX8_DPP-NEXT: ds_and_rtn_b64 v[5:6], v7, v[5:6]
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_DPP-NEXT: .LBB16_2:
; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
-; GFX8_DPP-NEXT: s_mov_b32 s2, -1
-; GFX8_DPP-NEXT: v_and_b32_e32 v6, s4, v6
-; GFX8_DPP-NEXT: v_and_b32_e32 v5, s5, v5
+; GFX8_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s0, v6
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s1, v5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX8_DPP-NEXT: s_mov_b32 s7, 0xf000
+; GFX8_DPP-NEXT: v_and_b32_e32 v6, s0, v6
+; GFX8_DPP-NEXT: v_and_b32_e32 v5, s1, v5
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0
+; GFX8_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[4:7], 0
; GFX8_DPP-NEXT: s_endpgm
;
; GFX9_DPP-LABEL: and_i64_varying:
; GFX9_DPP: ; %bb.0: ; %entry
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v5, s[0:1]
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s[0:1]
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX9_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: s_mov_b32 s6, -1
+; GFX9_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1
+; GFX9_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_and_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_and_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63
+; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6
; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9_DPP-NEXT: s_cbranch_execz .LBB16_2
; GFX9_DPP-NEXT: ; %bb.1:
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4
-; GFX9_DPP-NEXT: ds_and_rtn_b64 v[7:8], v6, v[7:8]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s4
+; GFX9_DPP-NEXT: ds_and_rtn_b64 v[5:6], v7, v[5:6]
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_DPP-NEXT: .LBB16_2:
; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
-; GFX9_DPP-NEXT: s_mov_b32 s2, -1
-; GFX9_DPP-NEXT: v_and_b32_e32 v6, s4, v6
-; GFX9_DPP-NEXT: v_and_b32_e32 v5, s5, v5
+; GFX9_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v6
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9_DPP-NEXT: s_mov_b32 s7, 0xf000
+; GFX9_DPP-NEXT: v_and_b32_e32 v6, s0, v6
+; GFX9_DPP-NEXT: v_and_b32_e32 v5, s1, v5
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[0:3], 0
+; GFX9_DPP-NEXT: buffer_store_dwordx2 v[5:6], off, s[4:7], 0
; GFX9_DPP-NEXT: s_endpgm
;
; GFX1064_DPP-LABEL: and_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s[0:1]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[0:1]
-; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, -1
+; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -7575,63 +7547,62 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15
-; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15
-; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31
-; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31
-; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 31
; GFX1064_DPP-NEXT: v_writelane_b32 v6, s4, 16
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 63
; GFX1064_DPP-NEXT: v_writelane_b32 v5, s5, 16
-; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63
-; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47
; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47
; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32
; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
-; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 48
-; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48
+; GFX1064_DPP-NEXT: v_writelane_b32 v6, s8, 48
+; GFX1064_DPP-NEXT: v_writelane_b32 v5, s9, 48
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7]
-; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX1064_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064_DPP-NEXT: s_cbranch_execz .LBB16_2
; GFX1064_DPP-NEXT: ; %bb.1:
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0
-; GFX1064_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s0
+; GFX1064_DPP-NEXT: ds_and_rtn_b64 v[7:8], v0, v[7:8]
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_gl0_inv
; GFX1064_DPP-NEXT: .LBB16_2:
; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v5
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v6
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v6
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v7
; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1064_DPP-NEXT: v_and_b32_e32 v8, s0, v8
-; GFX1064_DPP-NEXT: v_and_b32_e32 v7, s1, v7
+; GFX1064_DPP-NEXT: v_and_b32_e32 v9, s0, v9
+; GFX1064_DPP-NEXT: v_and_b32_e32 v8, s1, v8
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0
+; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0
; GFX1064_DPP-NEXT: s_endpgm
;
; GFX1032_DPP-LABEL: and_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s4
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s4
-; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, -1
+; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -7643,199 +7614,201 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
; GFX1032_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 31
; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15
-; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s0, v2, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1032_DPP-NEXT: v_writelane_b32 v6, s6, 16
-; GFX1032_DPP-NEXT: v_writelane_b32 v5, s5, 16
+; GFX1032_DPP-NEXT: v_writelane_b32 v6, s5, 16
+; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB16_2
; GFX1032_DPP-NEXT: ; %bb.1:
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0
-; GFX1032_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s0
+; GFX1032_DPP-NEXT: ds_and_rtn_b64 v[7:8], v0, v[7:8]
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_gl0_inv
; GFX1032_DPP-NEXT: .LBB16_2:
; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v5
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v6
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v6
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v7
; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1032_DPP-NEXT: v_and_b32_e32 v8, s0, v8
-; GFX1032_DPP-NEXT: v_and_b32_e32 v7, s1, v7
+; GFX1032_DPP-NEXT: v_and_b32_e32 v9, s0, v9
+; GFX1032_DPP-NEXT: v_and_b32_e32 v8, s1, v8
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0
+; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0
; GFX1032_DPP-NEXT: s_endpgm
;
; GFX1164_DPP-LABEL: and_i64_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0
+; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[0:1]
-; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s[0:1]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, -1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
-; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15
-; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15
-; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31
-; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31
-; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31
; GFX1164_DPP-NEXT: v_writelane_b32 v6, s4, 16
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v2, 31
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 63
; GFX1164_DPP-NEXT: v_writelane_b32 v5, s5, 16
-; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 63
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63
-; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47
; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32
+; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47
; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
-; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48
-; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48
+; GFX1164_DPP-NEXT: v_writelane_b32 v6, s8, 48
+; GFX1164_DPP-NEXT: v_writelane_b32 v5, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX1164_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
-; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1164_DPP-NEXT: s_cbranch_execz .LBB16_2
; GFX1164_DPP-NEXT: ; %bb.1:
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0
-; GFX1164_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0
+; GFX1164_DPP-NEXT: ds_and_rtn_b64 v[7:8], v0, v[7:8]
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_gl0_inv
; GFX1164_DPP-NEXT: .LBB16_2:
; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v5
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v6
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v6
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v7
; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_and_b32_e32 v8, s0, v8
-; GFX1164_DPP-NEXT: v_and_b32_e32 v7, s1, v7
+; GFX1164_DPP-NEXT: v_and_b32_e32 v9, s0, v9
+; GFX1164_DPP-NEXT: v_and_b32_e32 v8, s1, v8
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0
+; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0
; GFX1164_DPP-NEXT: s_nop 0
; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164_DPP-NEXT: s_endpgm
;
; GFX1132_DPP-LABEL: and_i64_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0
+; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v7, s4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v8, s4
-; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, -1, 0, s4
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, -1, v0, s4
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, -1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
-; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_and_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: v_and_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15
-; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s0, v2, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16
-; GFX1132_DPP-NEXT: v_writelane_b32 v5, s5, 16
+; GFX1132_DPP-NEXT: v_writelane_b32 v6, s5, 16
+; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
-; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB16_2
; GFX1132_DPP-NEXT: ; %bb.1:
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0
-; GFX1132_DPP-NEXT: ds_and_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
+; GFX1132_DPP-NEXT: ds_and_rtn_b64 v[7:8], v0, v[7:8]
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_gl0_inv
; GFX1132_DPP-NEXT: .LBB16_2:
; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v5
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v6
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v6
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v7
; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_and_b32_e32 v8, s0, v8
-; GFX1132_DPP-NEXT: v_and_b32_e32 v7, s1, v7
+; GFX1132_DPP-NEXT: v_and_b32_e32 v9, s0, v9
+; GFX1132_DPP-NEXT: v_and_b32_e32 v8, s1, v8
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0
+; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0
; GFX1132_DPP-NEXT: s_nop 0
; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132_DPP-NEXT: s_endpgm
@@ -8816,54 +8789,52 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
;
; GFX8_DPP-LABEL: or_i64_varying:
; GFX8_DPP: ; %bb.0: ; %entry
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1]
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63
+; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6
; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8_DPP-NEXT: s_cbranch_execz .LBB18_2
; GFX8_DPP-NEXT: ; %bb.1:
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s4
; GFX8_DPP-NEXT: s_mov_b32 m0, -1
-; GFX8_DPP-NEXT: ds_or_rtn_b64 v[7:8], v6, v[7:8]
+; GFX8_DPP-NEXT: ds_or_rtn_b64 v[5:6], v7, v[5:6]
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_DPP-NEXT: .LBB18_2:
; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v6
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX8_DPP-NEXT: s_mov_b32 s2, -1
; GFX8_DPP-NEXT: v_or_b32_e32 v6, s4, v6
@@ -8874,53 +8845,51 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
;
; GFX9_DPP-LABEL: or_i64_varying:
; GFX9_DPP: ; %bb.0: ; %entry
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1]
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_or_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_or_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63
+; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6
; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9_DPP-NEXT: s_cbranch_execz .LBB18_2
; GFX9_DPP-NEXT: ; %bb.1:
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4
-; GFX9_DPP-NEXT: ds_or_rtn_b64 v[7:8], v6, v[7:8]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s4
+; GFX9_DPP-NEXT: ds_or_rtn_b64 v[5:6], v7, v[5:6]
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_DPP-NEXT: .LBB18_2:
; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v6
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX9_DPP-NEXT: s_mov_b32 s2, -1
; GFX9_DPP-NEXT: v_or_b32_e32 v6, s4, v6
@@ -8931,14 +8900,12 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: or_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1]
-; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -8959,63 +8926,62 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15
-; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15
-; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31
-; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31
-; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 31
; GFX1064_DPP-NEXT: v_writelane_b32 v6, s4, 16
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 63
; GFX1064_DPP-NEXT: v_writelane_b32 v5, s5, 16
-; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63
-; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47
; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47
; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32
; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
-; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 48
-; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48
+; GFX1064_DPP-NEXT: v_writelane_b32 v6, s8, 48
+; GFX1064_DPP-NEXT: v_writelane_b32 v5, s9, 48
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7]
-; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX1064_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064_DPP-NEXT: s_cbranch_execz .LBB18_2
; GFX1064_DPP-NEXT: ; %bb.1:
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0
-; GFX1064_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s0
+; GFX1064_DPP-NEXT: ds_or_rtn_b64 v[7:8], v0, v[7:8]
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_gl0_inv
; GFX1064_DPP-NEXT: .LBB18_2:
; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v5
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v6
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v6
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v7
; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1064_DPP-NEXT: v_or_b32_e32 v8, s0, v8
-; GFX1064_DPP-NEXT: v_or_b32_e32 v7, s1, v7
+; GFX1064_DPP-NEXT: v_or_b32_e32 v9, s0, v9
+; GFX1064_DPP-NEXT: v_or_b32_e32 v8, s1, v8
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0
+; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0
; GFX1064_DPP-NEXT: s_endpgm
;
; GFX1032_DPP-LABEL: or_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4
-; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -9027,199 +8993,201 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
; GFX1032_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 31
; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15
-; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s0, v2, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1032_DPP-NEXT: v_writelane_b32 v6, s6, 16
-; GFX1032_DPP-NEXT: v_writelane_b32 v5, s5, 16
+; GFX1032_DPP-NEXT: v_writelane_b32 v6, s5, 16
+; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB18_2
; GFX1032_DPP-NEXT: ; %bb.1:
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0
-; GFX1032_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s0
+; GFX1032_DPP-NEXT: ds_or_rtn_b64 v[7:8], v0, v[7:8]
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_gl0_inv
; GFX1032_DPP-NEXT: .LBB18_2:
; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v5
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v6
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v6
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v7
; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1032_DPP-NEXT: v_or_b32_e32 v8, s0, v8
-; GFX1032_DPP-NEXT: v_or_b32_e32 v7, s1, v7
+; GFX1032_DPP-NEXT: v_or_b32_e32 v9, s0, v9
+; GFX1032_DPP-NEXT: v_or_b32_e32 v8, s1, v8
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0
+; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0
; GFX1032_DPP-NEXT: s_endpgm
;
; GFX1164_DPP-LABEL: or_i64_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0
+; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1]
-; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
-; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15
-; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15
-; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31
-; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31
-; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31
; GFX1164_DPP-NEXT: v_writelane_b32 v6, s4, 16
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v2, 31
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 63
; GFX1164_DPP-NEXT: v_writelane_b32 v5, s5, 16
-; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 63
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63
-; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47
; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32
+; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47
; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
-; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48
-; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48
+; GFX1164_DPP-NEXT: v_writelane_b32 v6, s8, 48
+; GFX1164_DPP-NEXT: v_writelane_b32 v5, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX1164_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
-; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1164_DPP-NEXT: s_cbranch_execz .LBB18_2
; GFX1164_DPP-NEXT: ; %bb.1:
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0
-; GFX1164_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0
+; GFX1164_DPP-NEXT: ds_or_rtn_b64 v[7:8], v0, v[7:8]
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_gl0_inv
; GFX1164_DPP-NEXT: .LBB18_2:
; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v5
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v6
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v6
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v7
; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_or_b32_e32 v8, s0, v8
-; GFX1164_DPP-NEXT: v_or_b32_e32 v7, s1, v7
+; GFX1164_DPP-NEXT: v_or_b32_e32 v9, s0, v9
+; GFX1164_DPP-NEXT: v_or_b32_e32 v8, s1, v8
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0
+; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0
; GFX1164_DPP-NEXT: s_nop 0
; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164_DPP-NEXT: s_endpgm
;
; GFX1132_DPP-LABEL: or_i64_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0
+; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4
-; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s4
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s4
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
-; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_or_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: v_or_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15
-; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s0, v2, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16
-; GFX1132_DPP-NEXT: v_writelane_b32 v5, s5, 16
+; GFX1132_DPP-NEXT: v_writelane_b32 v6, s5, 16
+; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
-; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB18_2
; GFX1132_DPP-NEXT: ; %bb.1:
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0
-; GFX1132_DPP-NEXT: ds_or_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
+; GFX1132_DPP-NEXT: ds_or_rtn_b64 v[7:8], v0, v[7:8]
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_gl0_inv
; GFX1132_DPP-NEXT: .LBB18_2:
; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v5
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v6
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v6
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v7
; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_or_b32_e32 v8, s0, v8
-; GFX1132_DPP-NEXT: v_or_b32_e32 v7, s1, v7
+; GFX1132_DPP-NEXT: v_or_b32_e32 v9, s0, v9
+; GFX1132_DPP-NEXT: v_or_b32_e32 v8, s1, v8
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0
+; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0
; GFX1132_DPP-NEXT: s_nop 0
; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132_DPP-NEXT: s_endpgm
@@ -10200,54 +10168,52 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
;
; GFX8_DPP-LABEL: xor_i64_varying:
; GFX8_DPP: ; %bb.0: ; %entry
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1]
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v2, 63
+; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6
; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8_DPP-NEXT: s_cbranch_execz .LBB20_2
; GFX8_DPP-NEXT: ; %bb.1:
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, s5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, s4
; GFX8_DPP-NEXT: s_mov_b32 m0, -1
-; GFX8_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v6, v[7:8]
+; GFX8_DPP-NEXT: ds_xor_rtn_b64 v[5:6], v7, v[5:6]
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_DPP-NEXT: .LBB20_2:
; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v8
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v6
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX8_DPP-NEXT: s_mov_b32 s2, -1
; GFX8_DPP-NEXT: v_xor_b32_e32 v6, s4, v6
@@ -10258,53 +10224,51 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
;
; GFX9_DPP-LABEL: xor_i64_varying:
; GFX9_DPP: ; %bb.0: ; %entry
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v5, exec_lo, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v5, exec_hi, v5
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v5, s[0:1]
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v6, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_xor_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_xor_b32_dpp v3, v3, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v2, 63
+; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr5_vgpr6
; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9_DPP-NEXT: s_cbranch_execz .LBB20_2
; GFX9_DPP-NEXT: ; %bb.1:
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4
-; GFX9_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v6, v[7:8]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, s5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, s4
+; GFX9_DPP-NEXT: ds_xor_rtn_b64 v[5:6], v7, v[5:6]
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_DPP-NEXT: .LBB20_2:
; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v8
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v6
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
; GFX9_DPP-NEXT: s_mov_b32 s2, -1
; GFX9_DPP-NEXT: v_xor_b32_e32 v6, s4, v6
@@ -10315,14 +10279,12 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: xor_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1]
-; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1064_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -10343,63 +10305,62 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 15
-; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 15
-; GFX1064_DPP-NEXT: v_readlane_b32 s6, v2, 31
-; GFX1064_DPP-NEXT: v_readlane_b32 s7, v1, 31
-; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 15
+; GFX1064_DPP-NEXT: v_readlane_b32 s6, v1, 31
+; GFX1064_DPP-NEXT: v_readlane_b32 s7, v2, 31
; GFX1064_DPP-NEXT: v_writelane_b32 v6, s4, 16
+; GFX1064_DPP-NEXT: v_readlane_b32 s4, v2, 63
; GFX1064_DPP-NEXT: v_writelane_b32 v5, s5, 16
-; GFX1064_DPP-NEXT: v_readlane_b32 s5, v2, 63
-; GFX1064_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX1064_DPP-NEXT: v_readlane_b32 s8, v1, 47
; GFX1064_DPP-NEXT: v_readlane_b32 s9, v2, 47
; GFX1064_DPP-NEXT: v_writelane_b32 v6, s6, 32
; GFX1064_DPP-NEXT: v_writelane_b32 v5, s7, 32
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
-; GFX1064_DPP-NEXT: v_writelane_b32 v6, s9, 48
-; GFX1064_DPP-NEXT: v_writelane_b32 v5, s8, 48
+; GFX1064_DPP-NEXT: v_writelane_b32 v6, s8, 48
+; GFX1064_DPP-NEXT: v_writelane_b32 v5, s9, 48
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7]
-; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX1064_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064_DPP-NEXT: s_cbranch_execz .LBB20_2
; GFX1064_DPP-NEXT: ; %bb.1:
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0
-; GFX1064_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, s1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, s0
+; GFX1064_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v0, v[7:8]
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_gl0_inv
; GFX1064_DPP-NEXT: .LBB20_2:
; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v5
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v6
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v5
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v6
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v7
; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1064_DPP-NEXT: v_xor_b32_e32 v8, s0, v8
-; GFX1064_DPP-NEXT: v_xor_b32_e32 v7, s1, v7
+; GFX1064_DPP-NEXT: v_xor_b32_e32 v9, s0, v9
+; GFX1064_DPP-NEXT: v_xor_b32_e32 v8, s1, v8
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0
+; GFX1064_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0
; GFX1064_DPP-NEXT: s_endpgm
;
; GFX1032_DPP-LABEL: xor_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4
-; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
@@ -10411,199 +10372,201 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
; GFX1032_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032_DPP-NEXT: v_readlane_b32 s0, v1, 31
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1032_DPP-NEXT: v_readlane_b32 s1, v1, 31
; GFX1032_DPP-NEXT: v_readlane_b32 s6, v2, 15
-; GFX1032_DPP-NEXT: v_readlane_b32 s1, v2, 31
-; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s0, v2, 31
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1032_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1032_DPP-NEXT: v_writelane_b32 v6, s6, 16
-; GFX1032_DPP-NEXT: v_writelane_b32 v5, s5, 16
+; GFX1032_DPP-NEXT: v_writelane_b32 v6, s5, 16
+; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB20_2
; GFX1032_DPP-NEXT: ; %bb.1:
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0
-; GFX1032_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, s1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, s0
+; GFX1032_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v0, v[7:8]
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_gl0_inv
; GFX1032_DPP-NEXT: .LBB20_2:
; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v5
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v6
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v5
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v6
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v7
; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1032_DPP-NEXT: v_xor_b32_e32 v8, s0, v8
-; GFX1032_DPP-NEXT: v_xor_b32_e32 v7, s1, v7
+; GFX1032_DPP-NEXT: v_xor_b32_e32 v9, s0, v9
+; GFX1032_DPP-NEXT: v_xor_b32_e32 v8, s1, v8
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0
+; GFX1032_DPP-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0
; GFX1032_DPP-NEXT: s_endpgm
;
; GFX1164_DPP-LABEL: xor_i64_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX1164_DPP-NEXT: v_and_b32_e32 v7, 0x3ff, v0
+; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s[0:1]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s[0:1]
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
-; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 31
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 31
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, s4
; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, s5
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 15
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 15
-; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 15
-; GFX1164_DPP-NEXT: v_readlane_b32 s6, v2, 31
-; GFX1164_DPP-NEXT: v_readlane_b32 s7, v1, 31
-; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 15
+; GFX1164_DPP-NEXT: v_readlane_b32 s6, v1, 31
; GFX1164_DPP-NEXT: v_writelane_b32 v6, s4, 16
+; GFX1164_DPP-NEXT: v_readlane_b32 s7, v2, 31
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v2, 63
; GFX1164_DPP-NEXT: v_writelane_b32 v5, s5, 16
-; GFX1164_DPP-NEXT: v_readlane_b32 s5, v2, 63
-; GFX1164_DPP-NEXT: v_readlane_b32 s4, v1, 63
-; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v1, 63
+; GFX1164_DPP-NEXT: v_readlane_b32 s8, v1, 47
; GFX1164_DPP-NEXT: v_writelane_b32 v6, s6, 32
+; GFX1164_DPP-NEXT: v_readlane_b32 s9, v2, 47
; GFX1164_DPP-NEXT: v_writelane_b32 v5, s7, 32
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
-; GFX1164_DPP-NEXT: v_writelane_b32 v6, s9, 48
-; GFX1164_DPP-NEXT: v_writelane_b32 v5, s8, 48
+; GFX1164_DPP-NEXT: v_writelane_b32 v6, s8, 48
+; GFX1164_DPP-NEXT: v_writelane_b32 v5, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
; GFX1164_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
-; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1164_DPP-NEXT: s_cbranch_execz .LBB20_2
; GFX1164_DPP-NEXT: ; %bb.1:
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0
-; GFX1164_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0
+; GFX1164_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v0, v[7:8]
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_gl0_inv
; GFX1164_DPP-NEXT: .LBB20_2:
; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v5
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v6
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v5
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v6
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v7
; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_xor_b32_e32 v8, s0, v8
-; GFX1164_DPP-NEXT: v_xor_b32_e32 v7, s1, v7
+; GFX1164_DPP-NEXT: v_xor_b32_e32 v9, s0, v9
+; GFX1164_DPP-NEXT: v_xor_b32_e32 v8, s1, v8
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0
+; GFX1164_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0
; GFX1164_DPP-NEXT: s_nop 0
; GFX1164_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164_DPP-NEXT: s_endpgm
;
; GFX1132_DPP-LABEL: xor_i64_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v7, 0x3ff, v0
+; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v7, s4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4
-; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v1, 0, 0, s4
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v2, 0, v0, s4
; GFX1132_DPP-NEXT: v_mov_b32_e32 v5, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v3, v1, -1, -1
; GFX1132_DPP-NEXT: v_permlanex16_b32 v4, v2, -1, -1
-; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_xor_b32_dpp v1, v3, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1132_DPP-NEXT: v_xor_b32_dpp v2, v4, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: v_readlane_b32 s0, v1, 31
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_readlane_b32 s1, v1, 31
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v2, 15
-; GFX1132_DPP-NEXT: v_readlane_b32 s1, v2, 31
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s0, v2, 31
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_readlane_b32 s5, v1, 15
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: v_writelane_b32 v6, s6, 16
-; GFX1132_DPP-NEXT: v_writelane_b32 v5, s5, 16
+; GFX1132_DPP-NEXT: v_writelane_b32 v6, s5, 16
+; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
-; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB20_2
; GFX1132_DPP-NEXT: ; %bb.1:
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0
-; GFX1132_DPP-NEXT: ds_xor_rtn_b64 v[9:10], v8, v[9:10]
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v8, s1 :: v_dual_mov_b32 v7, s0
+; GFX1132_DPP-NEXT: ds_xor_rtn_b64 v[7:8], v0, v[7:8]
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_gl0_inv
; GFX1132_DPP-NEXT: .LBB20_2:
; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v7, v5
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v10
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v6
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v9
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v8
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v8, v5
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v6
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v7
; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_xor_b32_e32 v8, s0, v8
-; GFX1132_DPP-NEXT: v_xor_b32_e32 v7, s1, v7
+; GFX1132_DPP-NEXT: v_xor_b32_e32 v9, s0, v9
+; GFX1132_DPP-NEXT: v_xor_b32_e32 v8, s1, v8
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132_DPP-NEXT: buffer_store_b64 v[7:8], off, s[4:7], 0
+; GFX1132_DPP-NEXT: buffer_store_b64 v[8:9], off, s[4:7], 0
; GFX1132_DPP-NEXT: s_nop 0
; GFX1132_DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132_DPP-NEXT: s_endpgm
@@ -10934,11 +10897,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX8_DPP-NEXT: s_mov_b64 exec, -1
-; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -10982,11 +10942,8 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, 1
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9_DPP-NEXT: s_mov_b64 exec, -1
-; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -11885,20 +11842,16 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX8_DPP-LABEL: max_i64_varying:
; GFX8_DPP: ; %bb.0: ; %entry
; GFX8_DPP-NEXT: s_mov_b32 s0, 0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0
; GFX8_DPP-NEXT: s_brev_b32 s1, 1
-; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX8_DPP-NEXT: s_mov_b64 exec, -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s1
+; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, 1
; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s0
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[4:5]
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[4:5]
; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_nop 0
@@ -11953,22 +11906,22 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_mov_b64 exec, s[4:5]
; GFX8_DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
-; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8_DPP-NEXT: s_cbranch_execz .LBB23_2
; GFX8_DPP-NEXT: ; %bb.1:
-; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4
; GFX8_DPP-NEXT: s_mov_b32 m0, -1
-; GFX8_DPP-NEXT: ds_max_rtn_i64 v[9:10], v8, v[9:10]
+; GFX8_DPP-NEXT: ds_max_rtn_i64 v[7:8], v9, v[7:8]
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_DPP-NEXT: .LBB23_2:
; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v8
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v7
; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v9
; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX8_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8]
; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5
@@ -11984,20 +11937,16 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX9_DPP-LABEL: max_i64_varying:
; GFX9_DPP: ; %bb.0: ; %entry
; GFX9_DPP-NEXT: s_mov_b32 s0, 0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0
; GFX9_DPP-NEXT: s_brev_b32 s1, 1
-; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, s0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX9_DPP-NEXT: s_mov_b64 exec, -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s1
+; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, 1
; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s0
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[4:5]
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[4:5]
; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_nop 0
@@ -12052,21 +12001,21 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_mov_b64 exec, s[4:5]
; GFX9_DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
-; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9_DPP-NEXT: s_cbranch_execz .LBB23_2
; GFX9_DPP-NEXT: ; %bb.1:
-; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4
-; GFX9_DPP-NEXT: ds_max_rtn_i64 v[9:10], v8, v[9:10]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX9_DPP-NEXT: ds_max_rtn_i64 v[7:8], v9, v[7:8]
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_DPP-NEXT: .LBB23_2:
; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v8
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v7
; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v10
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v9
; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX9_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8]
; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5
@@ -12081,15 +12030,13 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: max_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: s_mov_b32 s4, 0
; GFX1064_DPP-NEXT: s_brev_b32 s5, 1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
-; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s5
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, s4, v9, s[0:1]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, s5, v10, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, 0, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
@@ -12156,30 +12103,31 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32
; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 48
; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7]
-; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
; GFX1064_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064_DPP-NEXT: s_cbranch_execz .LBB23_2
; GFX1064_DPP-NEXT: ; %bb.1:
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1064_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1064_DPP-NEXT: ds_max_rtn_i64 v[9:10], v0, v[9:10]
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_gl0_inv
; GFX1064_DPP-NEXT: .LBB23_2:
; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064_DPP-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[9:10]
@@ -12191,15 +12139,13 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: max_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: s_mov_b32 s0, 0
; GFX1032_DPP-NEXT: s_brev_b32 s1, 1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, 0, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, s1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s0
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, s0, v9, s4
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, s1, v10, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
@@ -12244,29 +12190,30 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 15
; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB23_2
; GFX1032_DPP-NEXT: ; %bb.1:
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1032_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1032_DPP-NEXT: ds_max_rtn_i64 v[9:10], v0, v[9:10]
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_gl0_inv
; GFX1032_DPP-NEXT: .LBB23_2:
; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[9:10]
@@ -12278,79 +12225,79 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
;
; GFX1164_DPP-LABEL: max_i64_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1164_DPP-NEXT: s_mov_b32 s0, 0
-; GFX1164_DPP-NEXT: s_brev_b32 s1, 1
-; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s0
-; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
-; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0
-; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, s0, v9, s[4:5]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, s1, v10, s[4:5]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_mov_b32 s4, 0
+; GFX1164_DPP-NEXT: s_brev_b32 s5, 1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s5
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, 0, s[0:1]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[3:4]
-; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1164_DPP-NEXT: v_readlane_b32 s0, v4, 31
-; GFX1164_DPP-NEXT: v_readlane_b32 s1, v3, 31
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s0
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s1
+; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_cmp_gt_i64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX1164_DPP-NEXT: s_mov_b64 exec, s[4:5]
+; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -12369,29 +12316,30 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 48
; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
; GFX1164_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
-; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1164_DPP-NEXT: s_cbranch_execz .LBB23_2
; GFX1164_DPP-NEXT: ; %bb.1:
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1164_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1164_DPP-NEXT: ds_max_rtn_i64 v[9:10], v0, v[9:10]
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_gl0_inv
; GFX1164_DPP-NEXT: .LBB23_2:
; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -12406,55 +12354,54 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
;
; GFX1132_DPP-LABEL: max_i64_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: s_mov_b32 s0, 0
; GFX1132_DPP-NEXT: s_brev_b32 s1, 1
-; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0
+; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, s0, v9, s4
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, s1, v10, s4
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0x80000000, 0, s4
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4
; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[5:6], v[3:4]
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[7:8]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[3:4], v[5:6]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -12462,27 +12409,28 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
-; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB23_2
; GFX1132_DPP-NEXT: ; %bb.1:
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0
-; GFX1132_DPP-NEXT: ds_max_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0
+; GFX1132_DPP-NEXT: ds_max_rtn_i64 v[9:10], v0, v[9:10]
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_gl0_inv
; GFX1132_DPP-NEXT: .LBB23_2:
; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -12821,11 +12769,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, -2
-; GFX8_DPP-NEXT: v_bfrev_b32_e32 v2, -2
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX8_DPP-NEXT: s_mov_b64 exec, -1
-; GFX8_DPP-NEXT: s_nop 0
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_nop 1
; GFX8_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -12869,11 +12814,8 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, -2
-; GFX9_DPP-NEXT: v_bfrev_b32_e32 v2, -2
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9_DPP-NEXT: s_mov_b64 exec, -1
-; GFX9_DPP-NEXT: s_nop 0
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_nop 1
; GFX9_DPP-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
@@ -13771,21 +13713,17 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
;
; GFX8_DPP-LABEL: min_i64_varying:
; GFX8_DPP: ; %bb.0: ; %entry
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX8_DPP-NEXT: s_mov_b32 s6, -1
; GFX8_DPP-NEXT: s_brev_b32 s7, -2
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, s6
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, s7
-; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX8_DPP-NEXT: s_mov_b64 exec, -1
; GFX8_DPP-NEXT: v_mov_b32_e32 v3, s6
+; GFX8_DPP-NEXT: v_bfrev_b32_e32 v1, -2
; GFX8_DPP-NEXT: v_mov_b32_e32 v4, s7
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[0:1]
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1]
; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_nop 0
@@ -13839,22 +13777,22 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8_DPP-NEXT: s_cbranch_execz .LBB26_2
; GFX8_DPP-NEXT: ; %bb.1:
-; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4
; GFX8_DPP-NEXT: s_mov_b32 m0, -1
-; GFX8_DPP-NEXT: ds_min_rtn_i64 v[9:10], v8, v[9:10]
+; GFX8_DPP-NEXT: ds_min_rtn_i64 v[7:8], v9, v[7:8]
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_DPP-NEXT: .LBB26_2:
; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s1, v8
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s0, v7
; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX8_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s1, v10
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX8_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[7:8]
; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s1
@@ -13868,21 +13806,17 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
;
; GFX9_DPP-LABEL: min_i64_varying:
; GFX9_DPP: ; %bb.0: ; %entry
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9_DPP-NEXT: s_mov_b32 s6, -1
; GFX9_DPP-NEXT: s_brev_b32 s7, -2
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, s6
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, s7
-; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, v7
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, v8
-; GFX9_DPP-NEXT: s_mov_b64 exec, -1
; GFX9_DPP-NEXT: v_mov_b32_e32 v3, s6
+; GFX9_DPP-NEXT: v_bfrev_b32_e32 v1, -2
; GFX9_DPP-NEXT: v_mov_b32_e32 v4, s7
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[0:1]
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1]
; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_nop 0
@@ -13936,21 +13870,21 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9_DPP-NEXT: s_cbranch_execz .LBB26_2
; GFX9_DPP-NEXT: ; %bb.1:
-; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4
-; GFX9_DPP-NEXT: ds_min_rtn_i64 v[9:10], v8, v[9:10]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX9_DPP-NEXT: ds_min_rtn_i64 v[7:8], v9, v[7:8]
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_DPP-NEXT: .LBB26_2:
; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v8
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v7
; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX9_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v10
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX9_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[7:8]
; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s1
@@ -13964,15 +13898,13 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: min_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
; GFX1064_DPP-NEXT: s_brev_b32 s7, -2
-; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, s6
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, s7
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, s6, v9, s[0:1]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, s7, v10, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fffffff, 0, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
@@ -14039,29 +13971,30 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: v_writelane_b32 v2, s7, 32
; GFX1064_DPP-NEXT: v_writelane_b32 v1, s8, 32
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1
; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX1064_DPP-NEXT: v_writelane_b32 v2, s10, 48
; GFX1064_DPP-NEXT: v_writelane_b32 v1, s11, 48
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[8:9]
-; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064_DPP-NEXT: s_cbranch_execz .LBB26_2
; GFX1064_DPP-NEXT: ; %bb.1:
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1064_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1064_DPP-NEXT: ds_min_rtn_i64 v[9:10], v0, v[9:10]
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_gl0_inv
; GFX1064_DPP-NEXT: .LBB26_2:
; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064_DPP-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[9:10]
@@ -14073,15 +14006,13 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: min_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
; GFX1032_DPP-NEXT: s_brev_b32 s7, -2
-; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, s6
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, s7
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, s6, v9, s4
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, s7, v10, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fffffff, 0, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
@@ -14126,28 +14057,29 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: v_readlane_b32 s7, v3, 15
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v1, s7, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB26_2
; GFX1032_DPP-NEXT: ; %bb.1:
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1032_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1032_DPP-NEXT: ds_min_rtn_i64 v[9:10], v0, v[9:10]
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_gl0_inv
; GFX1032_DPP-NEXT: .LBB26_2:
; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[9:10]
@@ -14159,79 +14091,79 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
;
; GFX1164_DPP-LABEL: min_i64_varying:
; GFX1164_DPP: ; %bb.0: ; %entry
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: s_mov_b32 s6, -1
; GFX1164_DPP-NEXT: s_brev_b32 s7, -2
-; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, s6
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, s7
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0
+; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, s6, v9, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, s7, v10, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fffffff, 0, s[0:1]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[5:6], v[3:4]
-; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_cmp_lt_i64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -14250,29 +14182,29 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_writelane_b32 v1, s8, 32
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[8:9], -1
; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX1164_DPP-NEXT: v_writelane_b32 v2, s10, 48
; GFX1164_DPP-NEXT: v_writelane_b32 v1, s11, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[8:9]
-; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
-; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1164_DPP-NEXT: s_cbranch_execz .LBB26_2
; GFX1164_DPP-NEXT: ; %bb.1:
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1164_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1164_DPP-NEXT: ds_min_rtn_i64 v[9:10], v0, v[9:10]
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_gl0_inv
; GFX1164_DPP-NEXT: .LBB26_2:
; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -14287,82 +14219,82 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
;
; GFX1132_DPP-LABEL: min_i64_varying:
; GFX1132_DPP: ; %bb.0: ; %entry
-; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT: s_brev_b32 s7, -2
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1
+; GFX1132_DPP-NEXT: s_brev_b32 s7, -2
+; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1132_DPP-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_mov_b32 v2, s7
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0
+; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, s6, v9, s4
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, s7, v10, s4
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0x7fffffff, 0, s4
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s4
; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[5:6], v[3:4]
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[7:8]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[3:4], v[5:6]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s5, v4, 15
; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: v_readlane_b32 s7, v3, 15
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s7, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
-; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB26_2
; GFX1132_DPP-NEXT: ; %bb.1:
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0
-; GFX1132_DPP-NEXT: ds_min_rtn_i64 v[11:12], v10, v[11:12]
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0
+; GFX1132_DPP-NEXT: ds_min_rtn_i64 v[9:10], v0, v[9:10]
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_gl0_inv
; GFX1132_DPP-NEXT: .LBB26_2:
; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -15634,85 +15566,84 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
;
; GFX8_DPP-LABEL: umax_i64_varying:
; GFX8_DPP: ; %bb.0: ; %entry
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1]
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_readlane_b32 s5, v2, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8_DPP-NEXT: s_cbranch_execz .LBB29_2
; GFX8_DPP-NEXT: ; %bb.1:
-; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4
; GFX8_DPP-NEXT: s_mov_b32 m0, -1
-; GFX8_DPP-NEXT: ds_max_rtn_u64 v[9:10], v8, v[9:10]
+; GFX8_DPP-NEXT: ds_max_rtn_u64 v[7:8], v9, v[7:8]
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_DPP-NEXT: .LBB29_2:
; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v8
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v7
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v9
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX8_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[7:8]
; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5
; GFX8_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
@@ -15726,84 +15657,83 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
;
; GFX9_DPP-LABEL: umax_i64_varying:
; GFX9_DPP: ; %bb.0: ; %entry
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v7, s[0:1]
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v8, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, 0, 0, s[0:1]
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_readlane_b32 s5, v2, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9_DPP-NEXT: s_cbranch_execz .LBB29_2
; GFX9_DPP-NEXT: ; %bb.1:
-; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4
-; GFX9_DPP-NEXT: ds_max_rtn_u64 v[9:10], v8, v[9:10]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX9_DPP-NEXT: ds_max_rtn_u64 v[7:8], v9, v[7:8]
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_DPP-NEXT: .LBB29_2:
; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v8
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v7
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v3
; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v10
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v9
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX9_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[7:8]
; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5
; GFX9_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
@@ -15817,18 +15747,14 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
;
; GFX1064_DPP-LABEL: umax_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s[0:1]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -15892,30 +15818,31 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32
; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 48
; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7]
-; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
; GFX1064_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064_DPP-NEXT: s_cbranch_execz .LBB29_2
; GFX1064_DPP-NEXT: ; %bb.1:
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1064_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1064_DPP-NEXT: ds_max_rtn_u64 v[9:10], v0, v[9:10]
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_gl0_inv
; GFX1064_DPP-NEXT: .LBB29_2:
; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064_DPP-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[9:10]
@@ -15927,13 +15854,11 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: umax_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v9, s4
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, v10, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
@@ -15978,29 +15903,30 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 15
; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB29_2
; GFX1032_DPP-NEXT: ; %bb.1:
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1032_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1032_DPP-NEXT: ds_max_rtn_u64 v[9:10], v0, v[9:10]
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_gl0_inv
; GFX1032_DPP-NEXT: .LBB29_2:
; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[9:10]
@@ -16016,73 +15942,73 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0
+; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s[0:1]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[3:4]
-; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_cmp_gt_u64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -16101,29 +16027,30 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 48
; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
; GFX1164_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
-; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1164_DPP-NEXT: s_cbranch_execz .LBB29_2
; GFX1164_DPP-NEXT: ; %bb.1:
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1164_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1164_DPP-NEXT: ds_max_rtn_u64 v[9:10], v0, v[9:10]
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_gl0_inv
; GFX1164_DPP-NEXT: .LBB29_2:
; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -16142,49 +16069,49 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, 0
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v0
+; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, 0, v9, s4
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, 0, v10, s4
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, 0, 0, s4
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4
; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[5:6], v[3:4]
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[7:8]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[3:4], v[5:6]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -16192,27 +16119,28 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
-; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB29_2
; GFX1132_DPP-NEXT: ; %bb.1:
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0
-; GFX1132_DPP-NEXT: ds_max_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0
+; GFX1132_DPP-NEXT: ds_max_rtn_u64 v[9:10], v0, v[9:10]
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_gl0_inv
; GFX1132_DPP-NEXT: .LBB29_2:
; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -17485,201 +17413,193 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
;
; GFX8_DPP-LABEL: umin_i64_varying:
; GFX8_DPP: ; %bb.0: ; %entry
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX8_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX8_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX8_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX8_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v7, s[0:1]
-; GFX8_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v8, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v4, -1
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v2, -1, 0, s[0:1]
+; GFX8_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX8_DPP-NEXT: s_mov_b32 s6, -1
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX8_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX8_DPP-NEXT: s_nop 0
-; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX8_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX8_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX8_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX8_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX8_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8_DPP-NEXT: v_readlane_b32 s5, v2, 63
+; GFX8_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX8_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX8_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX8_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX8_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX8_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8_DPP-NEXT: s_cbranch_execz .LBB32_2
; GFX8_DPP-NEXT: ; %bb.1:
-; GFX8_DPP-NEXT: v_mov_b32_e32 v10, s5
-; GFX8_DPP-NEXT: v_mov_b32_e32 v9, s4
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, s4
; GFX8_DPP-NEXT: s_mov_b32 m0, -1
-; GFX8_DPP-NEXT: ds_min_rtn_u64 v[9:10], v8, v[9:10]
+; GFX8_DPP-NEXT: ds_min_rtn_u64 v[7:8], v9, v[7:8]
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX8_DPP-NEXT: .LBB32_2:
; GFX8_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX8_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s5, v10
-; GFX8_DPP-NEXT: v_readfirstlane_b32 s4, v9
-; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8]
-; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s5
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s1, v8
+; GFX8_DPP-NEXT: v_readfirstlane_b32 s0, v7
+; GFX8_DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX8_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX8_DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX8_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[7:8]
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s1
; GFX8_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
-; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s4
-; GFX8_DPP-NEXT: s_mov_b32 s3, 0xf000
-; GFX8_DPP-NEXT: s_mov_b32 s2, -1
+; GFX8_DPP-NEXT: v_mov_b32_e32 v0, s0
+; GFX8_DPP-NEXT: s_mov_b32 s7, 0xf000
; GFX8_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
; GFX8_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX8_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0
; GFX8_DPP-NEXT: s_endpgm
;
; GFX9_DPP-LABEL: umin_i64_varying:
; GFX9_DPP: ; %bb.0: ; %entry
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v0
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9_DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX9_DPP-NEXT: v_mov_b32_e32 v9, 0
+; GFX9_DPP-NEXT: v_mbcnt_hi_u32_b32 v7, exec_hi, v7
; GFX9_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v1, -1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v7, s[0:1]
-; GFX9_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v8, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_mov_b32_e32 v3, -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v4, -1
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v2, -1, 0, s[0:1]
+; GFX9_DPP-NEXT: v_cndmask_b32_e64 v1, -1, v0, s[0:1]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
+; GFX9_DPP-NEXT: s_mov_b32 s6, -1
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9_DPP-NEXT: v_mov_b32_e32 v6, v4
; GFX9_DPP-NEXT: s_nop 0
-; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX9_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX9_DPP-NEXT: v_readlane_b32 s5, v4, 63
-; GFX9_DPP-NEXT: v_readlane_b32 s4, v3, 63
-; GFX9_DPP-NEXT: v_mov_b32_dpp v1, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9_DPP-NEXT: v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v6, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[1:2], v[5:6]
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9_DPP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9_DPP-NEXT: v_readlane_b32 s5, v2, 63
+; GFX9_DPP-NEXT: v_readlane_b32 s4, v1, 63
+; GFX9_DPP-NEXT: v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9_DPP-NEXT: v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
; GFX9_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX9_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX9_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8
; GFX9_DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9_DPP-NEXT: s_cbranch_execz .LBB32_2
; GFX9_DPP-NEXT: ; %bb.1:
-; GFX9_DPP-NEXT: v_mov_b32_e32 v10, s5
-; GFX9_DPP-NEXT: v_mov_b32_e32 v9, s4
-; GFX9_DPP-NEXT: ds_min_rtn_u64 v[9:10], v8, v[9:10]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, s5
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, s4
+; GFX9_DPP-NEXT: ds_min_rtn_u64 v[7:8], v9, v[7:8]
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9_DPP-NEXT: .LBB32_2:
; GFX9_DPP-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v1
-; GFX9_DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s5, v10
-; GFX9_DPP-NEXT: v_readfirstlane_b32 s4, v9
-; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8]
-; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s5
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s1, v8
+; GFX9_DPP-NEXT: v_readfirstlane_b32 s0, v7
+; GFX9_DPP-NEXT: v_mov_b32_e32 v7, v3
+; GFX9_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX9_DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[7:8]
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s1
; GFX9_DPP-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
-; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s4
-; GFX9_DPP-NEXT: s_mov_b32 s3, 0xf000
-; GFX9_DPP-NEXT: s_mov_b32 s2, -1
+; GFX9_DPP-NEXT: v_mov_b32_e32 v0, s0
+; GFX9_DPP-NEXT: s_mov_b32 s7, 0xf000
; GFX9_DPP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
; GFX9_DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[0:3], 0
+; GFX9_DPP-NEXT: buffer_store_dwordx2 v[7:8], off, s[4:7], 0
; GFX9_DPP-NEXT: s_endpgm
;
; GFX1064_DPP-LABEL: umin_i64_varying:
; GFX1064_DPP: ; %bb.0: ; %entry
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v1, -1
; GFX1064_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v9, s[0:1]
-; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v10, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v4, -1, 0, s[0:1]
+; GFX1064_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s[0:1]
; GFX1064_DPP-NEXT: v_mov_b32_e32 v5, v1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_e32 v7, v1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v6, v2
; GFX1064_DPP-NEXT: v_mov_b32_e32 v8, v2
; GFX1064_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -17743,30 +17663,31 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1064_DPP-NEXT: v_writelane_b32 v2, s6, 32
; GFX1064_DPP-NEXT: v_writelane_b32 v1, s7, 32
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1064_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1064_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX1064_DPP-NEXT: v_writelane_b32 v2, s8, 48
; GFX1064_DPP-NEXT: v_writelane_b32 v1, s9, 48
; GFX1064_DPP-NEXT: s_mov_b64 exec, s[6:7]
-; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
; GFX1064_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1064_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1064_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1064_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064_DPP-NEXT: s_cbranch_execz .LBB32_2
; GFX1064_DPP-NEXT: ; %bb.1:
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1064_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1064_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1064_DPP-NEXT: ds_min_rtn_u64 v[9:10], v0, v[9:10]
; GFX1064_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064_DPP-NEXT: buffer_gl0_inv
; GFX1064_DPP-NEXT: .LBB32_2:
; GFX1064_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1064_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1064_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1064_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1064_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064_DPP-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[9:10]
@@ -17778,13 +17699,11 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
;
; GFX1032_DPP-LABEL: umin_i64_varying:
; GFX1032_DPP: ; %bb.0: ; %entry
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v1, -1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v2, -1
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v9, s4
-; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, -1, v10, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v4, -1, 0, s4
+; GFX1032_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s4
; GFX1032_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1032_DPP-NEXT: v_mov_b32_e32 v6, v2
@@ -17829,29 +17748,30 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1032_DPP-NEXT: v_readlane_b32 s5, v4, 15
; GFX1032_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1032_DPP-NEXT: v_writelane_b32 v1, s6, 16
; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4
-; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1032_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1032_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
+; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
; GFX1032_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032_DPP-NEXT: s_cbranch_execz .LBB32_2
; GFX1032_DPP-NEXT: ; %bb.1:
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1032_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1032_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1032_DPP-NEXT: ds_min_rtn_u64 v[9:10], v0, v[9:10]
; GFX1032_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032_DPP-NEXT: buffer_gl0_inv
; GFX1032_DPP-NEXT: .LBB32_2:
; GFX1032_DPP-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032_DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1032_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1032_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1032_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1032_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[9:10]
@@ -17867,73 +17787,73 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_mov_b32_e32 v1, -1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v2, -1
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, 0
-; GFX1164_DPP-NEXT: v_and_b32_e32 v9, 0x3ff, v0
+; GFX1164_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v5, -1, v9, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v6, -1, v10, s[0:1]
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v4, -1, 0, s[0:1]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
+; GFX1164_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s[0:1]
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[5:6], v[3:4]
-; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, v2
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[7:8]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
-; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
; GFX1164_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: v_mov_b32_e32 v5, v1
; GFX1164_DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164_DPP-NEXT: v_readlane_b32 s4, v4, 31
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_readlane_b32 s5, v3, 31
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164_DPP-NEXT: v_mov_b32_e32 v7, s4
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v8, s5
; GFX1164_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
+; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_cmp_lt_u64_e32 vcc, v[3:4], v[5:6]
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX1164_DPP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
+; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1164_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -17952,29 +17872,30 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1164_DPP-NEXT: v_writelane_b32 v1, s7, 32
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mbcnt_hi_u32_b32 v9, exec_hi, v0
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1164_DPP-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1164_DPP-NEXT: s_mov_b64 s[0:1], s[4:5]
; GFX1164_DPP-NEXT: v_writelane_b32 v2, s8, 48
; GFX1164_DPP-NEXT: v_writelane_b32 v1, s9, 48
; GFX1164_DPP-NEXT: s_mov_b64 exec, s[6:7]
+; GFX1164_DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
; GFX1164_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1164_DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
-; GFX1164_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1164_DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1164_DPP-NEXT: s_cbranch_execz .LBB32_2
; GFX1164_DPP-NEXT: ; %bb.1:
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v12, s1
-; GFX1164_DPP-NEXT: v_mov_b32_e32 v11, s0
-; GFX1164_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, s1
+; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, s0
+; GFX1164_DPP-NEXT: ds_min_rtn_u64 v[9:10], v0, v[9:10]
; GFX1164_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_DPP-NEXT: buffer_gl0_inv
; GFX1164_DPP-NEXT: .LBB32_2:
; GFX1164_DPP-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1164_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1164_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1164_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1164_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -17993,49 +17914,49 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_e32 v1, -1
; GFX1132_DPP-NEXT: v_mov_b32_e32 v2, -1
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v0
+; GFX1132_DPP-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v5, -1, v9, s4
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v6, -1, v10, s4
+; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v4, -1, 0, s4
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
+; GFX1132_DPP-NEXT: v_cndmask_b32_e64 v3, -1, v0, s4
; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v3, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132_DPP-NEXT: v_mov_b32_dpp v4, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[5:6], v[3:4]
-; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
; GFX1132_DPP-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v2
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
; GFX1132_DPP-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v6, v2
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[7:8]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v7, v4, -1, -1
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_permlanex16_b32 v8, v3, -1, -1
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v6, v7 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_mov_b32_dpp v5, v8 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132_DPP-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[3:4], v[5:6]
; GFX1132_DPP-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v3, v5, v3
-; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_readlane_b32 s1, v4, 31
+; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132_DPP-NEXT: v_readlane_b32 s6, v3, 15
; GFX1132_DPP-NEXT: v_readlane_b32 s0, v3, 31
; GFX1132_DPP-NEXT: v_mov_b32_dpp v2, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -18043,27 +17964,28 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
; GFX1132_DPP-NEXT: v_mov_b32_dpp v1, v3 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
; GFX1132_DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX1132_DPP-NEXT: v_mov_b32_e32 v0, 0
; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1
; GFX1132_DPP-NEXT: v_writelane_b32 v2, s5, 16
; GFX1132_DPP-NEXT: v_writelane_b32 v1, s6, 16
; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4
+; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
; GFX1132_DPP-NEXT: s_mov_b32 s6, -1
-; GFX1132_DPP-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132_DPP-NEXT: ; implicit-def: $vgpr11_vgpr12
-; GFX1132_DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10
+; GFX1132_DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1132_DPP-NEXT: s_cbranch_execz .LBB32_2
; GFX1132_DPP-NEXT: ; %bb.1:
-; GFX1132_DPP-NEXT: v_dual_mov_b32 v12, s1 :: v_dual_mov_b32 v11, s0
-; GFX1132_DPP-NEXT: ds_min_rtn_u64 v[11:12], v10, v[11:12]
+; GFX1132_DPP-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_mov_b32 v9, s0
+; GFX1132_DPP-NEXT: ds_min_rtn_u64 v[9:10], v0, v[9:10]
; GFX1132_DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_DPP-NEXT: buffer_gl0_inv
; GFX1132_DPP-NEXT: .LBB32_2:
; GFX1132_DPP-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132_DPP-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v10
+; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v9
; GFX1132_DPP-NEXT: v_mov_b32_e32 v9, v1
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s1, v12
-; GFX1132_DPP-NEXT: v_readfirstlane_b32 s0, v11
; GFX1132_DPP-NEXT: v_mov_b32_e32 v10, v2
; GFX1132_DPP-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
index 6bd0b11..d62ff37 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
@@ -59,7 +59,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX908-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec
; GFX908-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
; GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
- ; GFX908-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec
+ ; GFX908-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GFX908-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF]], implicit $exec
; GFX908-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX908-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
; GFX908-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
@@ -120,7 +121,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A_GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec
; GFX90A_GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
- ; GFX90A_GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec
+ ; GFX90A_GFX940-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GFX90A_GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF]], implicit $exec
; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
@@ -177,7 +179,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX11_GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GFX11_GFX12-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[S_MOV_B32_]], implicit $exec
; GFX11_GFX12-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
- ; GFX11_GFX12-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec
+ ; GFX11_GFX12-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; GFX11_GFX12-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF]], implicit $exec
; GFX11_GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 353, 15, 15, 0, implicit $exec
; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
; GFX11_GFX12-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 354, 15, 15, 0, implicit $exec
@@ -186,8 +189,8 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, killed [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec
; GFX11_GFX12-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 360, 15, 15, 0, implicit $exec
; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec
- ; GFX11_GFX12-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX11_GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+ ; GFX11_GFX12-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX11_GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF1]]
; GFX11_GFX12-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_]], 0, [[S_MOV_B32_]], [[COPY5]], 0, implicit $exec
; GFX11_GFX12-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_PERMLANEX16_B32_e64_]], 0, 0, implicit $mode, implicit $exec
; GFX11_GFX12-NEXT: early-clobber %1:vgpr_32 = STRICT_WWM killed [[V_ADD_F32_e64_4]], implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
index 6766c0c..946ee9e 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
@@ -61,7 +61,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX90A-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec
; GFX90A-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
- ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec
+ ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF1]], implicit $exec
; GFX90A-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX90A-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
; GFX90A-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
@@ -80,7 +81,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX90A-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec
; GFX90A-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
; GFX90A-NEXT: [[COPY8:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]]
- ; GFX90A-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX90A-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX90A-NEXT: S_BRANCH %bb.2
; GFX90A-NEXT: {{ $}}
@@ -102,11 +103,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX90A-NEXT: bb.4 (%ir-block.35):
; GFX90A-NEXT: successors: %bb.3(0x80000000)
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
+ ; GFX90A-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
; GFX90A-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX90A-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec
- ; GFX90A-NEXT: early-clobber %45:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
- ; GFX90A-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %45, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: early-clobber %46:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
+ ; GFX90A-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %46, 0, 0, implicit $mode, implicit $exec
; GFX90A-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]]
; GFX90A-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
; GFX90A-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY11]], [[COPY10]], implicit $exec
@@ -142,7 +143,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY6]], [[COPY7]], implicit $exec
; GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY5]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
; GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
- ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec
+ ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+ ; GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF1]], implicit $exec
; GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
; GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
@@ -161,7 +163,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX940-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec
; GFX940-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
; GFX940-NEXT: [[COPY8:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]]
- ; GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX940-NEXT: S_BRANCH %bb.2
; GFX940-NEXT: {{ $}}
@@ -183,11 +185,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX940-NEXT: bb.4 (%ir-block.35):
; GFX940-NEXT: successors: %bb.3(0x80000000)
; GFX940-NEXT: {{ $}}
- ; GFX940-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF1]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
+ ; GFX940-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
; GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX940-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec
- ; GFX940-NEXT: early-clobber %44:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
- ; GFX940-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %44, 0, 0, implicit $mode, implicit $exec
+ ; GFX940-NEXT: early-clobber %45:vgpr_32 = STRICT_WWM [[V_MOV_B32_dpp6]], implicit $exec
+ ; GFX940-NEXT: [[V_ADD_F32_e64_6:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %45, 0, 0, implicit $mode, implicit $exec
; GFX940-NEXT: [[COPY10:%[0-9]+]]:sreg_64_xexec = COPY [[COPY8]]
; GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_READFIRSTLANE_B32_]]
; GFX940-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_6]], 0, [[COPY11]], [[COPY10]], implicit $exec
@@ -219,7 +221,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GFX11-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[S_MOV_B32_]], implicit $exec
; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2147483648, implicit $exec
- ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_]], implicit-def dead $scc, implicit $exec
+ ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 0, [[COPY]], 0, [[V_MOV_B32_e32_]], killed [[DEF1]], implicit $exec
; GFX11-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, killed [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
@@ -229,8 +232,8 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, killed [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
- ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
- ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF1]]
+ ; GFX11-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
; GFX11-NEXT: [[V_PERMLANEX16_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERMLANEX16_B32_e64 0, [[V_ADD_F32_e64_3]], 0, [[S_MOV_B32_1]], 0, [[S_MOV_B32_1]], [[COPY5]], 0, implicit $exec
; GFX11-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], killed [[V_PERMLANEX16_B32_e64_]], 228, 10, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec
@@ -244,7 +247,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: early-clobber %2:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_1]], implicit $exec
; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
; GFX11-NEXT: [[COPY6:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]]
- ; GFX11-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX11-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
; GFX11-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX11-NEXT: S_BRANCH %bb.2
; GFX11-NEXT: {{ $}}
@@ -266,11 +269,11 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: bb.4 (%ir-block.32):
; GFX11-NEXT: successors: %bb.3(0x80000000)
; GFX11-NEXT: {{ $}}
- ; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF2]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
+ ; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[DEF3]], %bb.1, [[GLOBAL_ATOMIC_ADD_F32_SADDR_RTN]], %bb.2
; GFX11-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[PHI1]], implicit $exec
- ; GFX11-NEXT: early-clobber %46:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec
- ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %46, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: early-clobber %47:vgpr_32 = STRICT_WWM [[V_WRITELANE_B32_]], implicit $exec
+ ; GFX11-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_READFIRSTLANE_B32_]], 0, killed %47, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[COPY8:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY6]]
; GFX11-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD_F32_e64_5]], 0, [[V_READFIRSTLANE_B32_]], [[COPY8]], implicit $exec
; GFX11-NEXT: S_BRANCH %bb.3
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 311c609..2b18f47 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -816,10 +816,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2027,10 +2024,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3298,10 +3292,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -4065,10 +4056,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -5361,10 +5349,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -7330,44 +7315,45 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v11, 1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -7463,9 +7449,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -7585,9 +7571,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -7691,12 +7677,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -7813,42 +7799,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -8907,44 +8893,45 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -9008,9 +8995,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -9096,9 +9083,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -9168,12 +9155,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -9259,42 +9246,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -10330,44 +10317,45 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -10431,9 +10419,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -10519,9 +10507,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -10591,12 +10579,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -10682,42 +10670,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB14_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -11235,44 +11223,45 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -11336,9 +11325,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -11424,9 +11413,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -11496,12 +11485,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -11587,42 +11576,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB15_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -13305,44 +13294,45 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v11, 1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -13438,9 +13428,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -13560,9 +13550,9 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -13666,12 +13656,12 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -13788,42 +13778,42 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3
; GFX1132-DPP-NEXT: ; %bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 9dc82b1..e3144ae 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -718,10 +718,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -1756,10 +1753,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2794,10 +2788,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -4742,51 +4733,52 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -4884,9 +4876,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -5016,9 +5008,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -5130,12 +5122,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
@@ -5264,51 +5256,50 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -6188,51 +6179,52 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[7:8]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
-; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -6298,9 +6290,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x7ff80000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -6381,9 +6373,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x7ff80000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -6446,12 +6438,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
@@ -6550,51 +6542,50 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[6:7]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -8238,51 +8229,52 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[12:13]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -8380,9 +8372,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8512,9 +8504,9 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8626,12 +8618,12 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
@@ -8760,51 +8752,50 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[12:13]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1132-DPP-NEXT: ; %bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index 945583c..ddc1031 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -718,10 +718,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -1756,10 +1753,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2794,10 +2788,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0x7fc00000
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -4742,51 +4733,52 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -4884,9 +4876,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -5016,9 +5008,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -5130,12 +5122,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
@@ -5264,51 +5256,50 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -6188,51 +6179,52 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[7:8], v[7:8], v[7:8]
-; GFX9-DPP-NEXT: v_min_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[7:8]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[3:4]
-; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6]
+; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -6298,9 +6290,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x7ff80000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -6381,9 +6373,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0x7ff80000
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x7ff80000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -6446,12 +6438,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
@@ -6550,51 +6542,50 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0x7ff80000
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[4:5], v[4:5], v[6:7]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -8238,51 +8229,52 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v11, 0x7ff80000
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf
; GFX9-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
-; GFX9-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[12:13]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
-; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX9-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -8380,9 +8372,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8512,9 +8504,9 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8626,12 +8618,12 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
@@ -8760,51 +8752,50 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_mov_b32_e32 v9, 0x7ff80000
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[12:13], v[12:13], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[10:11], v[10:11], v[12:13]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[8:9]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, v8
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v9
; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1132-DPP-NEXT: ; %bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 3bc0f25..f353edf 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -894,10 +894,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -2217,10 +2214,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -3540,10 +3534,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -4359,10 +4350,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -5681,10 +5669,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
-; GFX9-DPP-NEXT: s_mov_b64 exec, -1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v0, s[0:1]
; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
@@ -7650,44 +7635,45 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v11, 1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -7783,9 +7769,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -7905,9 +7891,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -8011,12 +7997,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -8133,42 +8119,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -9226,44 +9212,45 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -9327,9 +9314,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -9415,9 +9402,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -9487,12 +9474,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -9578,42 +9565,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -10649,44 +10636,45 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -10750,9 +10738,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -10838,9 +10826,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -10910,12 +10898,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -11001,42 +10989,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB14_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -11554,44 +11542,45 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v6, 1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v4, v3, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v3, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v5
+; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v6
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[7:8]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -11655,9 +11644,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -11743,9 +11732,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v5, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v6, v4, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -11815,12 +11804,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -11906,42 +11895,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, v2, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v4, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v5, v3, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v2, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB15_3
; GFX1132-DPP-NEXT: ; %bb.1:
@@ -13623,44 +13612,45 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
-; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
-; GFX9-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_e32 v10, 0
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v8, 1
+; GFX9-DPP-NEXT: v_bfrev_b32_e32 v11, 1
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v9, v8, v1, s[0:1]
+; GFX9-DPP-NEXT: v_cndmask_b32_e64 v8, 0, v0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
-; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v11
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v8 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v9 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[12:13]
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
+; GFX9-DPP-NEXT: v_mov_b32_dpp v10, v8 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_mov_b32_dpp v11, v9 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
@@ -13756,9 +13746,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1]
+; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1064-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -13878,9 +13868,9 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0
+; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1032-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
@@ -13984,12 +13974,12 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s[0:1]
+; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
-; GFX1164-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s[0:1]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
@@ -14106,42 +14096,42 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, v8, v0, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, 0x80000000, v1, s0
+; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: v_cndmask_b32_e64 v11, v9, v1, s0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v9, 0, 0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9
; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3
; GFX1132-DPP-NEXT: ; %bb.1:
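
For orientation before the next file: the GFX9/GFX10/GFX11 check lines above all cover the same wave-level scan, where the divergent double operand is folded across lanes with DPP moves (row_shr, row_xmask, row_bcast, permlanex16) and v_add_f64 before a single lane performs the atomic. A minimal IR input of the shape these kernels are generated from might look like the sketch below; the helper name @div.double.value and the exact syncscope/ordering are assumptions for illustration, not copied from the test file.

define amdgpu_kernel void @fsub_div_value_sketch(ptr addrspace(1) %ptr) {
  ; Divergent per-lane value; the backend reduces it across the wave
  ; with the DPP sequence shown in the checks above. (Sketch only.)
  %val = call double @div.double.value()
  %old = atomicrmw fsub ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
  ret void
}
declare double @div.double.value()
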
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
index c1b58f1..fbf8c203 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
@@ -93,18 +93,18 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
; DAGISEL11: ; %bb.0:
; DAGISEL11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1
-; DAGISEL11-NEXT: v_dual_mov_b32 v1, v11 :: v_dual_mov_b32 v0, v10
+; DAGISEL11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v10
; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0
; DAGISEL11-NEXT: s_or_saveexec_b32 s0, -1
; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; DAGISEL11-NEXT: v_cndmask_b32_e64 v0, v0, v12, s0
-; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; DAGISEL11-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0
+; DAGISEL11-NEXT: v_cndmask_b32_e64 v2, v0, v13, s0
+; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; DAGISEL11-NEXT: v_cndmask_b32_e64 v1, v1, v12, s0
; DAGISEL11-NEXT: s_mov_b32 exec_lo, s0
-; DAGISEL11-NEXT: v_mov_b32_e32 v2, v0
-; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; DAGISEL11-NEXT: v_mov_b32_e32 v3, v1
-; DAGISEL11-NEXT: global_store_b64 v[8:9], v[2:3], off
+; DAGISEL11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; DAGISEL11-NEXT: v_mov_b32_e32 v4, v2
+; DAGISEL11-NEXT: global_store_b64 v[8:9], v[3:4], off
; DAGISEL11-NEXT: s_endpgm
;
; GISEL10-LABEL: set_inactive_chain_arg_64:
@@ -127,16 +127,16 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
; DAGISEL10: ; %bb.0:
; DAGISEL10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1
-; DAGISEL10-NEXT: v_mov_b32_e32 v1, v11
-; DAGISEL10-NEXT: v_mov_b32_e32 v0, v10
+; DAGISEL10-NEXT: v_mov_b32_e32 v0, v11
+; DAGISEL10-NEXT: v_mov_b32_e32 v1, v10
; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0
; DAGISEL10-NEXT: s_or_saveexec_b32 s0, -1
-; DAGISEL10-NEXT: v_cndmask_b32_e64 v0, v0, v12, s0
-; DAGISEL10-NEXT: v_cndmask_b32_e64 v1, v1, v13, s0
+; DAGISEL10-NEXT: v_cndmask_b32_e64 v2, v0, v13, s0
+; DAGISEL10-NEXT: v_cndmask_b32_e64 v1, v1, v12, s0
; DAGISEL10-NEXT: s_mov_b32 exec_lo, s0
-; DAGISEL10-NEXT: v_mov_b32_e32 v2, v0
; DAGISEL10-NEXT: v_mov_b32_e32 v3, v1
-; DAGISEL10-NEXT: global_store_dwordx2 v[8:9], v[2:3], off
+; DAGISEL10-NEXT: v_mov_b32_e32 v4, v2
+; DAGISEL10-NEXT: global_store_dwordx2 v[8:9], v[3:4], off
; DAGISEL10-NEXT: s_endpgm
;
; GISEL11_W64-LABEL: set_inactive_chain_arg_64:
@@ -162,18 +162,19 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
; DAGISEL11_W64: ; %bb.0:
; DAGISEL11_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
-; DAGISEL11_W64-NEXT: v_mov_b32_e32 v1, v11
-; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v10
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v0, v11
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v1, v10
; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1]
; DAGISEL11_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
-; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1]
-; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1]
+; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v2, v0, v13, s[0:1]
+; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; DAGISEL11_W64-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[0:1]
; DAGISEL11_W64-NEXT: s_mov_b64 exec, s[0:1]
-; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; DAGISEL11_W64-NEXT: v_mov_b32_e32 v2, v0
; DAGISEL11_W64-NEXT: v_mov_b32_e32 v3, v1
-; DAGISEL11_W64-NEXT: global_store_b64 v[8:9], v[2:3], off
+; DAGISEL11_W64-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; DAGISEL11_W64-NEXT: v_mov_b32_e32 v4, v2
+; DAGISEL11_W64-NEXT: global_store_b64 v[8:9], v[3:4], off
; DAGISEL11_W64-NEXT: s_endpgm
;
; GISEL10_W64-LABEL: set_inactive_chain_arg_64:
@@ -196,16 +197,16 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
; DAGISEL10_W64: ; %bb.0:
; DAGISEL10_W64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
-; DAGISEL10_W64-NEXT: v_mov_b32_e32 v1, v11
-; DAGISEL10_W64-NEXT: v_mov_b32_e32 v0, v10
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v0, v11
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v1, v10
; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1]
; DAGISEL10_W64-NEXT: s_or_saveexec_b64 s[0:1], -1
-; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[0:1]
-; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[0:1]
+; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v2, v0, v13, s[0:1]
+; DAGISEL10_W64-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[0:1]
; DAGISEL10_W64-NEXT: s_mov_b64 exec, s[0:1]
-; DAGISEL10_W64-NEXT: v_mov_b32_e32 v2, v0
; DAGISEL10_W64-NEXT: v_mov_b32_e32 v3, v1
-; DAGISEL10_W64-NEXT: global_store_dwordx2 v[8:9], v[2:3], off
+; DAGISEL10_W64-NEXT: v_mov_b32_e32 v4, v2
+; DAGISEL10_W64-NEXT: global_store_dwordx2 v[8:9], v[3:4], off
; DAGISEL10_W64-NEXT: s_endpgm
%tmp = call i64 @llvm.amdgcn.set.inactive.chain.arg.i64(i64 %active, i64 %inactive) #0
%wwm = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp)
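
The two context lines above show the IR these checks are generated from: llvm.amdgcn.set.inactive.chain.arg yields %active in lanes enabled by exec and %inactive in disabled lanes, and strict.wwm then reads the result under whole-wave execution. A self-contained sketch of the same pairing (the function name here is chosen for illustration):

define amdgpu_cs_chain void @set_inactive_chain_sketch(ptr addrspace(1) %out, i64 %active, i64 %inactive) {
  ; Active lanes keep %active; inactive lanes observe %inactive.
  %tmp = call i64 @llvm.amdgcn.set.inactive.chain.arg.i64(i64 %active, i64 %inactive)
  ; strict.wwm makes the combined value usable by normal (active-lane) code.
  %wwm = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp)
  store i64 %wwm, ptr addrspace(1) %out
  ret void
}
declare i64 @llvm.amdgcn.set.inactive.chain.arg.i64(i64, i64)
declare i64 @llvm.amdgcn.strict.wwm.i64(i64)
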
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
index 6dc4a2c..6fb5a9c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
@@ -5,16 +5,14 @@
define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: set_inactive:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, 42
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 42, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
@@ -46,19 +44,19 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) {
; GCN-LABEL: set_inactive_64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
@@ -96,11 +94,9 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_buffer_load_dword s4, s[4:7], 0x0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GCN-NEXT: v_mov_b32_e32 v1, s8
; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1
-; GCN-NEXT: v_mov_b32_e32 v0, 42
-; GCN-NEXT: s_mov_b64 exec, s[2:3]
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 42, v1, s[2:3]
; GCN-NEXT: s_mov_b64 exec, s[2:3]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s4, 56
@@ -145,17 +141,15 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
; GCN-LABEL: set_inactive_f32:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT: s_mov_b32 s7, 0x40400000
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, s7
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x40400000
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
@@ -170,21 +164,21 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
; GCN-LABEL: set_inactive_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: s_mov_b32 s0, 0xcccccccd
-; GCN-NEXT: s_mov_b32 s1, 0x4010cccc
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, -1
-; GCN-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x4010cccc
+; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0xcccccccd
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
@@ -198,17 +192,15 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
; GCN-LABEL: set_inactive_v2i16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT: s_mov_b32 s7, 0x10001
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, s7
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x10001
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
@@ -222,17 +214,15 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %
define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
; GCN-LABEL: set_inactive_v2f16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT: s_mov_b32 s7, 0x3c003c00
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, s7
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x3c003c00
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
@@ -247,21 +237,19 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %
; GCN-LABEL: set_inactive_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-NEXT: s_mov_b32 s10, 1
-; GCN-NEXT: s_mov_b32 s11, s10
-; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_mov_b32_e32 v1, s11
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_cndmask_b32_e64 v1, 1, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 1, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
@@ -276,21 +264,19 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
; GCN-LABEL: set_inactive_v2f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-NEXT: s_mov_b32 s10, 1.0
-; GCN-NEXT: s_mov_b32 s11, s10
-; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_mov_b32_e32 v1, s11
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_cndmask_b32_e64 v1, 1.0, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 1.0, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
@@ -304,17 +290,15 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
; GCN-LABEL: set_inactive_v2bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GCN-NEXT: s_mov_b32 s7, 0x3f803f80
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, s7
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
@@ -329,21 +313,20 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %
; GCN-LABEL: set_inactive_v4i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-NEXT: s_mov_b32 s10, 0x10001
-; GCN-NEXT: s_mov_b32 s11, s10
-; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_mov_b32_e32 v1, s11
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x10001
+; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
@@ -358,21 +341,20 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half>
; GCN-LABEL: set_inactive_v4f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-NEXT: s_mov_b32 s10, 0x3c003c00
-; GCN-NEXT: s_mov_b32 s11, s10
-; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_mov_b32_e32 v1, s11
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x3c003c00
+; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
@@ -387,21 +369,20 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa
; GCN-LABEL: set_inactive_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-NEXT: s_mov_b32 s10, 0x3f803f80
-; GCN-NEXT: s_mov_b32 s11, s10
-; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_mov_b32_e32 v1, s11
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80
+; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
@@ -416,19 +397,19 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
; GCN-LABEL: set_inactive_p0:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_mov_b64 exec, -1
-; GCN-NEXT: s_mov_b64 exec, s[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_cndmask_b32_e64 v1, 0, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v2, s[0:1]
+; GCN-NEXT: s_mov_b64 exec, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
@@ -442,16 +423,14 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
; GCN-LABEL: set_inactive_p2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
@@ -465,16 +444,14 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
; GCN-LABEL: set_inactive_p3:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
@@ -488,16 +465,14 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
; GCN-LABEL: set_inactive_p5:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
@@ -511,16 +486,14 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
; GCN-LABEL: set_inactive_p6:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s6, s[2:3], 0x2c
+; GCN-NEXT: s_load_dword s4, s[2:3], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: s_mov_b64 exec, -1
+; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, v1, s[4:5]
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
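
All of the set_inactive_* kernels above follow one template: load %in from the kernel arguments, apply llvm.amdgcn.set.inactive with a type-appropriate inactive constant (42, 1.0, 0x3c003c00, a null pointer, and so on), and store the whole-wave result. A representative sketch for the i32 case, assuming the usual pairing with strict.wwm:

define amdgpu_kernel void @set_inactive_sketch(ptr addrspace(1) %out, i32 %in) {
  ; Inactive lanes read 42; active lanes keep %in.
  %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42)
  %wwm = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp)
  store i32 %wwm, ptr addrspace(1) %out
  ret void
}
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)
declare i32 @llvm.amdgcn.strict.wwm.i32(i32)

As the hunks show, this pattern now lowers to a single v_cndmask_b32 selecting between the inactive constant and the live value under an s_or_saveexec mask, replacing the earlier exec-toggled pair of v_mov_b32 writes.
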
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
index 8a5f753..b0fb24e 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir
@@ -111,7 +111,7 @@ body: |
; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr103, 3, $vgpr5
; GCN-NEXT: $sgpr22 = IMPLICIT_DEF
; GCN-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr22, 0, killed $vgpr2
- ; GCN-NEXT: dead $vgpr1 = V_SET_INACTIVE_B32 $vgpr0, 0, implicit $exec, implicit-def $scc
+ ; GCN-NEXT: dead $vgpr1 = V_SET_INACTIVE_B32 0, $vgpr0, 0, 0, $sgpr_null, implicit $exec, implicit-def $scc
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.1:
; GCN-NEXT: successors: %bb.2(0x80000000)
@@ -212,7 +212,7 @@ body: |
$sgpr22 = IMPLICIT_DEF
SI_SPILL_S32_SAVE $sgpr22, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.0, addrspace 5)
- %0:vgpr_32 = V_SET_INACTIVE_B32 $vgpr0, 0, implicit $exec, implicit-def $scc
+ %0:vgpr_32 = V_SET_INACTIVE_B32 0, $vgpr0, 0, 0, $sgpr_null, implicit $exec, implicit-def $scc
bb.1:
KILL implicit-def $vcc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99_sgpr100_sgpr101_sgpr102_sgpr103
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index ff692ac..92117e0 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -1708,8 +1708,8 @@ define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in)
; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX1032-NEXT: s_or_saveexec_b32 s0, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, s6, s0
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s7, s0
+; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, s6, s0
; GFX1032-NEXT: s_mov_b32 exec_lo, s0
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
; GFX1032-NEXT: v_mov_b32_e32 v4, 0
@@ -1722,8 +1722,8 @@ define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in)
; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, s6, s[0:1]
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s7, s[0:1]
+; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, s6, s[0:1]
; GFX1064-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
; GFX1064-NEXT: v_mov_b32_e32 v4, 0
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir
index 64a7c445..3013aab 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.mir
+++ b/llvm/test/CodeGen/AMDGPU/wqm.mir
@@ -40,9 +40,6 @@
define amdgpu_vs void @no_wqm_in_vs() {
ret void
}
- define amdgpu_ps void @preloaded_set_inactive() {
- ret void
- }
...
---
@@ -155,7 +152,7 @@ registers:
- { id: 9, class: sreg_32, preferred-register: '' }
- { id: 10, class: vgpr_32, preferred-register: '' }
- { id: 11, class: vgpr_32, preferred-register: '' }
- - { id: 12, class: sreg_32, preferred-register: '' }
+ - { id: 12, class: vgpr_32, preferred-register: '' }
- { id: 13, class: vgpr_32, preferred-register: '' }
- { id: 14, class: vgpr_32, preferred-register: '' }
- { id: 15, class: vgpr_32, preferred-register: '' }
@@ -179,7 +176,8 @@ body: |
%8:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %6, %7, 0, 0, 0, implicit $exec
%16:vgpr_32 = COPY %8.sub1
%11:vgpr_32 = COPY %16
- %10:vgpr_32 = V_SET_INACTIVE_B32 %11, undef %12:sreg_32, implicit $exec, implicit-def $scc
+ %17:sreg_64_xexec = IMPLICIT_DEF
+ %10:vgpr_32 = V_SET_INACTIVE_B32 0, %11, 0, undef %12, undef %17, implicit $exec, implicit-def $scc
%14:vgpr_32 = COPY %7
%13:vgpr_32 = V_MOV_B32_dpp %14, killed %10, 323, 12, 15, 0, implicit $exec
early-clobber %15:vgpr_32 = STRICT_WWM killed %13, implicit $exec
@@ -298,8 +296,9 @@ body: |
%0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
%1:vgpr_32 = COPY $vgpr0
%2:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN %1:vgpr_32, %0:sgpr_128, 0, 0, 0, 0, implicit $exec
- %2.sub0:vreg_64 = V_SET_INACTIVE_B32 %2.sub0:vreg_64, 0, implicit $exec, implicit-def $scc
- %2.sub1:vreg_64 = V_SET_INACTIVE_B32 %2.sub1:vreg_64, 0, implicit $exec, implicit-def $scc
+ %4:sreg_64_xexec = IMPLICIT_DEF
+ %2.sub0:vreg_64 = V_SET_INACTIVE_B32 0, %2.sub0:vreg_64, 0, 0, undef %4, implicit $exec, implicit-def $scc
+ %2.sub1:vreg_64 = V_SET_INACTIVE_B32 0, %2.sub1:vreg_64, 0, 0, undef %4, implicit $exec, implicit-def $scc
%3:vreg_64 = nnan nsz arcp contract reassoc nofpexcept V_MAX_F64_e64 0, %2:vreg_64, 0, %2:vreg_64, 0, 0, implicit $mode, implicit $exec
$vgpr0 = STRICT_WWM %3.sub0:vreg_64, implicit $exec
$vgpr1 = STRICT_WWM %3.sub1:vreg_64, implicit $exec
@@ -446,19 +445,3 @@ body: |
%4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
...
-
----
-# Preserve V_SET_INACTIVE with exec mask already specified
-#CHECK-LABEL: name: preloaded_set_inactive
-#CHECK: V_SET_INACTIVE_B32
-name: preloaded_set_inactive
-tracksRegLiveness: true
-body: |
- bb.0:
- liveins: $vgpr1, $vgpr2
-
- %0:vgpr_32 = COPY $vgpr1
- %1:vgpr_32 = COPY $vgpr2
- %mask:sreg_64 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
- %value:vgpr_32 = V_SET_INACTIVE_B32 %0:vgpr_32, %1:vgpr_32, implicit $exec, implicit-def $scc, implicit %mask:sreg_64
-...
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index 47e1897..b35ef64 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -30,15 +30,15 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
+; GFX9-O0-NEXT: ; implicit-def: $sgpr40_sgpr41
; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[40:41]
; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr40_sgpr41
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34
-; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[40:41]
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -170,11 +170,10 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
+; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
; GFX9-O0-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34
-; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[36:37]
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -201,6 +200,9 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[34:35]
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -352,32 +354,32 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
; GFX9-O0-NEXT: s_mov_b32 s33, s32
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400
; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0
; GFX9-O0-NEXT: v_writelane_b32 v3, s31, 1
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
-; GFX9-O0-NEXT: s_mov_b32 s42, s6
+; GFX9-O0-NEXT: s_mov_b32 s40, s6
; GFX9-O0-NEXT: s_mov_b32 s34, s4
-; GFX9-O0-NEXT: ; kill: def $sgpr42 killed $sgpr42 def $sgpr42_sgpr43
-; GFX9-O0-NEXT: s_mov_b32 s43, s7
-; GFX9-O0-NEXT: s_mov_b32 s44, s43
-; GFX9-O0-NEXT: s_mov_b32 s45, s42
+; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41
+; GFX9-O0-NEXT: s_mov_b32 s41, s7
+; GFX9-O0-NEXT: s_mov_b32 s42, s41
+; GFX9-O0-NEXT: s_mov_b32 s43, s40
; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
; GFX9-O0-NEXT: s_mov_b32 s35, s5
-; GFX9-O0-NEXT: s_mov_b32 s46, s35
+; GFX9-O0-NEXT: s_mov_b32 s44, s35
; GFX9-O0-NEXT: s_mov_b32 s36, s34
; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
-; GFX9-O0-NEXT: s_mov_b32 s37, s46
-; GFX9-O0-NEXT: s_mov_b32 s38, s45
-; GFX9-O0-NEXT: s_mov_b32 s39, s44
+; GFX9-O0-NEXT: s_mov_b32 s37, s44
+; GFX9-O0-NEXT: s_mov_b32 s38, s43
+; GFX9-O0-NEXT: s_mov_b32 s39, s42
; GFX9-O0-NEXT: s_mov_b32 s34, 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34
-; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr40_sgpr41
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[40:41]
; GFX9-O0-NEXT: s_getpc_b64 s[42:43]
; GFX9-O0-NEXT: s_add_u32 s42, s42, strict_wwm_called@rel32@lo+4
; GFX9-O0-NEXT: s_addc_u32 s43, s43, strict_wwm_called@rel32@hi+12
@@ -396,8 +398,8 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00
; GFX9-O0-NEXT: s_mov_b32 s33, s48
@@ -417,11 +419,9 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
; GFX9-O3-NEXT: v_writelane_b32 v3, s30, 0
; GFX9-O3-NEXT: s_addk_i32 s32, 0x400
; GFX9-O3-NEXT: v_writelane_b32 v3, s31, 1
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, s8
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, s8
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[34:35]
; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2
; GFX9-O3-NEXT: s_getpc_b64 s[36:37]
; GFX9-O3-NEXT: s_add_u32 s36, s36, strict_wwm_called@rel32@lo+4
@@ -539,20 +539,20 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O0-LABEL: strict_wwm_call_i64:
; GFX9-O0: ; %bb.0:
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-O0-NEXT: s_mov_b32 s50, s33
+; GFX9-O0-NEXT: s_mov_b32 s48, s33
; GFX9-O0-NEXT: s_mov_b32 s33, s32
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
@@ -560,48 +560,59 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x1000
-; GFX9-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
+; GFX9-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0
; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 1
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[38:39], -1
-; GFX9-O0-NEXT: v_writelane_b32 v0, s38, 0
-; GFX9-O0-NEXT: v_writelane_b32 v0, s39, 1
; GFX9-O0-NEXT: s_mov_b32 s34, s8
-; GFX9-O0-NEXT: s_mov_b32 s40, s6
+; GFX9-O0-NEXT: s_mov_b32 s38, s6
; GFX9-O0-NEXT: s_mov_b32 s36, s4
-; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41
-; GFX9-O0-NEXT: s_mov_b32 s41, s7
-; GFX9-O0-NEXT: s_mov_b32 s35, s41
-; GFX9-O0-NEXT: s_mov_b32 s42, s40
+; GFX9-O0-NEXT: ; kill: def $sgpr38 killed $sgpr38 def $sgpr38_sgpr39
+; GFX9-O0-NEXT: s_mov_b32 s39, s7
+; GFX9-O0-NEXT: s_mov_b32 s35, s39
+; GFX9-O0-NEXT: s_mov_b32 s44, s38
; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37
; GFX9-O0-NEXT: s_mov_b32 s37, s5
-; GFX9-O0-NEXT: s_mov_b32 s43, s37
-; GFX9-O0-NEXT: s_mov_b32 s44, s36
-; GFX9-O0-NEXT: ; kill: def $sgpr44 killed $sgpr44 def $sgpr44_sgpr45_sgpr46_sgpr47
-; GFX9-O0-NEXT: s_mov_b32 s45, s43
-; GFX9-O0-NEXT: s_mov_b32 s46, s42
-; GFX9-O0-NEXT: s_mov_b32 s47, s35
-; GFX9-O0-NEXT: v_writelane_b32 v0, s44, 2
-; GFX9-O0-NEXT: v_writelane_b32 v0, s45, 3
-; GFX9-O0-NEXT: v_writelane_b32 v0, s46, 4
-; GFX9-O0-NEXT: v_writelane_b32 v0, s47, 5
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49]
+; GFX9-O0-NEXT: s_mov_b32 s45, s37
+; GFX9-O0-NEXT: s_mov_b32 s40, s36
+; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41_sgpr42_sgpr43
+; GFX9-O0-NEXT: s_mov_b32 s41, s45
+; GFX9-O0-NEXT: s_mov_b32 s42, s44
+; GFX9-O0-NEXT: s_mov_b32 s43, s35
+; GFX9-O0-NEXT: v_writelane_b32 v1, s40, 0
+; GFX9-O0-NEXT: v_writelane_b32 v1, s41, 1
+; GFX9-O0-NEXT: v_writelane_b32 v1, s42, 2
+; GFX9-O0-NEXT: v_writelane_b32 v1, s43, 3
; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
; GFX9-O0-NEXT: s_mov_b32 s35, s9
; GFX9-O0-NEXT: ; kill: def $sgpr36_sgpr37 killed $sgpr34_sgpr35
+; GFX9-O0-NEXT: s_mov_b32 s38, s35
; GFX9-O0-NEXT: s_mov_b64 s[36:37], 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, s36
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, s37
+; GFX9-O0-NEXT: s_mov_b32 s40, s37
+; GFX9-O0-NEXT: ; implicit-def: $sgpr42_sgpr43
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, s38
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[38:39], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s40
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[38:39]
; GFX9-O0-NEXT: s_mov_b64 exec, s[38:39]
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, s34
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, s35
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8
+; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 killed $sgpr34_sgpr35
+; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 killed $sgpr36_sgpr37
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT: v_writelane_b32 v1, s34, 4
+; GFX9-O0-NEXT: v_writelane_b32 v1, s35, 5
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47]
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, s36
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[34:35]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr34
+; GFX9-O0-NEXT: ; implicit-def: $sgpr34
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2
; GFX9-O0-NEXT: s_mov_b32 s34, 32
-; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
-; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s34, v[8:9]
+; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s34, v[8:9]
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
; GFX9-O0-NEXT: s_getpc_b64 s[34:35]
; GFX9-O0-NEXT: s_add_u32 s34, s34, strict_wwm_called_i64@gotpcrel32@lo+4
; GFX9-O0-NEXT: s_addc_u32 s35, s35, strict_wwm_called_i64@gotpcrel32@hi+12
@@ -610,24 +621,24 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O0-NEXT: s_mov_b64 s[36:37], s[0:1]
; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[36:37]
; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1
; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49]
+; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s34, v6, 0
-; GFX9-O0-NEXT: v_readlane_b32 s35, v6, 1
-; GFX9-O0-NEXT: v_readlane_b32 s36, v6, 2
-; GFX9-O0-NEXT: v_readlane_b32 s37, v6, 3
-; GFX9-O0-NEXT: v_readlane_b32 s38, v6, 4
-; GFX9-O0-NEXT: v_readlane_b32 s39, v6, 5
+; GFX9-O0-NEXT: v_readlane_b32 s34, v6, 4
+; GFX9-O0-NEXT: v_readlane_b32 s35, v6, 5
+; GFX9-O0-NEXT: v_readlane_b32 s36, v6, 0
+; GFX9-O0-NEXT: v_readlane_b32 s37, v6, 1
+; GFX9-O0-NEXT: v_readlane_b32 s38, v6, 2
+; GFX9-O0-NEXT: v_readlane_b32 s39, v6, 3
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49]
+; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47]
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: ; implicit-def: $sgpr40
; GFX9-O0-NEXT: ; implicit-def: $sgpr40
@@ -645,21 +656,21 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O0-NEXT: ; kill: killed $vgpr0
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff000
-; GFX9-O0-NEXT: s_mov_b32 s33, s50
+; GFX9-O0-NEXT: s_mov_b32 s33, s48
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
;
@@ -671,7 +682,6 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
@@ -682,25 +692,25 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O3-NEXT: v_writelane_b32 v8, s30, 0
; GFX9-O3-NEXT: s_addk_i32 s32, 0x800
; GFX9-O3-NEXT: v_writelane_b32 v8, s31, 1
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, s9
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[34:35]
+; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, s8
; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-O3-NEXT: s_getpc_b64 s[34:35]
; GFX9-O3-NEXT: s_add_u32 s34, s34, strict_wwm_called_i64@gotpcrel32@lo+4
; GFX9-O3-NEXT: s_addc_u32 s35, s35, strict_wwm_called_i64@gotpcrel32@hi+12
; GFX9-O3-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, s8
-; GFX9-O3-NEXT: v_mov_b32_e32 v7, s9
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
-; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6
-; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[36:37]
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, v7
+; GFX9-O3-NEXT: v_mov_b32_e32 v1, v6
; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-O3-NEXT: v_mov_b32_e32 v2, v0
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
-; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
+; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7
+; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc
; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v3
@@ -735,8 +745,10 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: s_mov_b32 s36, s4
; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
@@ -747,73 +759,82 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O0-NEXT: s_mov_b32 s34, 5
; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s34, v0
; GFX9-O0-NEXT: s_mov_b32 s34, 0
-; GFX9-O0-NEXT: buffer_load_dwordx4 v[10:13], v0, s[36:39], s34 offen
-; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], v0, s[36:39], s34 offen offset:16
+; GFX9-O0-NEXT: buffer_load_dwordx4 v[11:14], v0, s[36:39], s34 offen
+; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], v0, s[36:39], s34 offen offset:16
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10
-; GFX9-O0-NEXT: ; implicit-def: $sgpr35
-; GFX9-O0-NEXT: ; implicit-def: $sgpr35
-; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12
; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
; GFX9-O0-NEXT: s_mov_b32 s35, 0x7fffffff
-; GFX9-O0-NEXT: s_mov_b32 s42, -1
-; GFX9-O0-NEXT: ; kill: def $sgpr42 killed $sgpr42 def $sgpr42_sgpr43
-; GFX9-O0-NEXT: s_mov_b32 s43, s35
+; GFX9-O0-NEXT: s_mov_b32 s44, -1
+; GFX9-O0-NEXT: ; kill: def $sgpr44 killed $sgpr44 def $sgpr44_sgpr45
+; GFX9-O0-NEXT: s_mov_b32 s45, s35
+; GFX9-O0-NEXT: s_mov_b32 s42, s45
+; GFX9-O0-NEXT: ; implicit-def: $sgpr46_sgpr47
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s43
-; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[40:41]
+; GFX9-O0-NEXT: s_mov_b32 s35, s44
+; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s35
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[40:41]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr43
+; GFX9-O0-NEXT: ; implicit-def: $sgpr43
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
-; GFX9-O0-NEXT: ; implicit-def: $sgpr35
-; GFX9-O0-NEXT: ; implicit-def: $sgpr35
-; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14
; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s43
-; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[40:41]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s35
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[40:41]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr43
+; GFX9-O0-NEXT: ; implicit-def: $sgpr43
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s43
-; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[40:41]
; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr42_sgpr43
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s35
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[40:41]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr35
+; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
; GFX9-O0-NEXT: ; implicit-def: $sgpr35
-; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9
-; GFX9-O0-NEXT: buffer_store_dwordx4 v[5:8], v0, s[36:39], s34 offen
-; GFX9-O0-NEXT: buffer_store_dwordx2 v[3:4], v0, s[36:39], s34 offen offset:16
+; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10
+; GFX9-O0-NEXT: buffer_store_dwordx4 v[6:9], v0, s[36:39], s34 offen
+; GFX9-O0-NEXT: buffer_store_dwordx2 v[4:5], v0, s[36:39], s34 offen offset:16
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
@@ -823,50 +844,38 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O3-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O3-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[4:7], 0 offen
-; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16
+; GFX9-O3-NEXT: buffer_load_dwordx4 v[8:11], v0, s[4:7], 0 offen
+; GFX9-O3-NEXT: buffer_load_dwordx2 v[12:13], v0, s[4:7], 0 offen offset:16
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT: s_mov_b32 s36, -1
-; GFX9-O3-NEXT: s_brev_b32 s37, -2
-; GFX9-O3-NEXT: v_mov_b32_e32 v1, s36
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, s37
-; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O3-NEXT: v_bfrev_b32_e32 v1, -2
; GFX9-O3-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, s36
-; GFX9-O3-NEXT: v_mov_b32_e32 v4, s37
-; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9
-; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
-; GFX9-O3-NEXT: v_mov_b32_e32 v5, s36
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, s37
-; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, v1, v9, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v5, v1, v11, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v4, -1, v10, s[34:35]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v7, v1, v13, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, -1, v12, s[34:35]
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1
-; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3
; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2
; GFX9-O3-NEXT: v_mov_b32_e32 v10, v4
+; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3
; GFX9-O3-NEXT: v_mov_b32_e32 v11, v5
; GFX9-O3-NEXT: v_mov_b32_e32 v12, v6
-; GFX9-O3-NEXT: buffer_store_dwordx4 v[7:10], v0, s[4:7], 0 offen
-; GFX9-O3-NEXT: buffer_store_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16
+; GFX9-O3-NEXT: v_mov_b32_e32 v13, v7
+; GFX9-O3-NEXT: buffer_store_dwordx4 v[8:11], v0, s[4:7], 0 offen
+; GFX9-O3-NEXT: buffer_store_dwordx2 v[12:13], v0, s[4:7], 0 offen offset:16
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -874,6 +883,7 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O3-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX9-O3-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-O3-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O3-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: s_setpc_b64 s[30:31]
@@ -908,9 +918,11 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0: ; %bb.0:
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
@@ -961,110 +973,113 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0-NEXT: ; implicit-def: $sgpr34
; GFX9-O0-NEXT: ; implicit-def: $sgpr34
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-O0-NEXT: v_mov_b32_e32 v34, s5
+; GFX9-O0-NEXT: v_mov_b32_e32 v35, s5
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s14
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s15
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s17
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v39, s17
-; GFX9-O0-NEXT: v_mov_b32_e32 v38, s18
-; GFX9-O0-NEXT: v_mov_b32_e32 v37, s19
-; GFX9-O0-NEXT: v_mov_b32_e32 v36, s20
-; GFX9-O0-NEXT: v_mov_b32_e32 v35, s21
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s22
+; GFX9-O0-NEXT: v_mov_b32_e32 v40, s18
+; GFX9-O0-NEXT: v_mov_b32_e32 v39, s19
+; GFX9-O0-NEXT: v_mov_b32_e32 v38, s20
+; GFX9-O0-NEXT: v_mov_b32_e32 v37, s21
+; GFX9-O0-NEXT: v_mov_b32_e32 v36, s22
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s23
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v46, s23
-; GFX9-O0-NEXT: v_mov_b32_e32 v45, s24
-; GFX9-O0-NEXT: v_mov_b32_e32 v44, s25
-; GFX9-O0-NEXT: v_mov_b32_e32 v43, s26
-; GFX9-O0-NEXT: v_mov_b32_e32 v42, s27
-; GFX9-O0-NEXT: v_mov_b32_e32 v41, s28
-; GFX9-O0-NEXT: v_mov_b32_e32 v40, s29
+; GFX9-O0-NEXT: v_mov_b32_e32 v46, s24
+; GFX9-O0-NEXT: v_mov_b32_e32 v45, s25
+; GFX9-O0-NEXT: v_mov_b32_e32 v44, s26
+; GFX9-O0-NEXT: v_mov_b32_e32 v43, s27
+; GFX9-O0-NEXT: v_mov_b32_e32 v42, s28
+; GFX9-O0-NEXT: v_mov_b32_e32 v41, s29
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v34
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v34
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v34
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v34
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v34
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v34
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v34
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v34
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v34
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v34
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v34
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v34
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v13, v39
-; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v38
-; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, v37
-; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v36
-; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v35
-; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v40
+; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v39
+; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v16, v38
+; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v17, v37
+; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v18, v36
+; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(5)
-; GFX9-O0-NEXT: v_mov_b32_e32 v18, v34
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v19, v46
-; GFX9-O0-NEXT: v_mov_b32_e32 v20, v45
-; GFX9-O0-NEXT: v_mov_b32_e32 v21, v44
-; GFX9-O0-NEXT: v_mov_b32_e32 v22, v43
-; GFX9-O0-NEXT: v_mov_b32_e32 v23, v42
-; GFX9-O0-NEXT: v_mov_b32_e32 v24, v41
-; GFX9-O0-NEXT: v_mov_b32_e32 v25, v40
+; GFX9-O0-NEXT: v_mov_b32_e32 v19, v35
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v20, v46
+; GFX9-O0-NEXT: v_mov_b32_e32 v21, v45
+; GFX9-O0-NEXT: v_mov_b32_e32 v22, v44
+; GFX9-O0-NEXT: v_mov_b32_e32 v23, v43
+; GFX9-O0-NEXT: v_mov_b32_e32 v24, v42
+; GFX9-O0-NEXT: v_mov_b32_e32 v25, v41
; GFX9-O0-NEXT: s_waitcnt vmcnt(5)
-; GFX9-O0-NEXT: v_mov_b32_e32 v26, v39
+; GFX9-O0-NEXT: v_mov_b32_e32 v26, v40
; GFX9-O0-NEXT: s_waitcnt vmcnt(4)
-; GFX9-O0-NEXT: v_mov_b32_e32 v27, v38
+; GFX9-O0-NEXT: v_mov_b32_e32 v27, v39
; GFX9-O0-NEXT: s_waitcnt vmcnt(3)
-; GFX9-O0-NEXT: v_mov_b32_e32 v28, v37
+; GFX9-O0-NEXT: v_mov_b32_e32 v28, v38
; GFX9-O0-NEXT: s_waitcnt vmcnt(2)
-; GFX9-O0-NEXT: v_mov_b32_e32 v29, v36
+; GFX9-O0-NEXT: v_mov_b32_e32 v29, v37
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT: v_mov_b32_e32 v30, v35
-; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr34 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v30, v36
+; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr35 killed $exec
; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
@@ -1104,63 +1119,100 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v0
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
-; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
-; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
-; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
-; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
+; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
+; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
+; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
+; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
+; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
-; GFX9-O0-NEXT: s_mov_b64 s[36:37], 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36
-; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v11
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-O0-NEXT: s_mov_b32 s37, s39
+; GFX9-O0-NEXT: ; implicit-def: $sgpr40_sgpr41
+; GFX9-O0-NEXT: v_mov_b32_e32 v32, s37
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v32, v32, v0, s[34:35]
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT: v_mov_b32_e32 v32, v10
-; GFX9-O0-NEXT: v_mov_b32_e32 v33, v11
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v10
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT: s_mov_b32 s36, s38
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v33, v33, v0, s[34:35]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38
+; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v32
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v33
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v33
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v34
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v9
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36
-; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT: v_mov_b32_e32 v32, s37
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v32, v32, v0, s[34:35]
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT: v_mov_b32_e32 v32, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v33, v9
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v33, v33, v0, s[34:35]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38
+; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v32
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v33
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v33
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v34
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36
-; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT: v_mov_b32_e32 v32, s37
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v32, v32, v0, s[34:35]
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT: v_mov_b32_e32 v32, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v33, v7
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v33, v33, v0, s[34:35]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38
+; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v32
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v33
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v33
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v34
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36
-; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT: v_mov_b32_e32 v32, s37
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v32, v32, v0, s[34:35]
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT: v_mov_b32_e32 v32, v4
-; GFX9-O0-NEXT: v_mov_b32_e32 v33, v5
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v33, v33, v0, s[34:35]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38
+; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v32
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v33
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v33
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v34
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT: v_mov_b32_e32 v32, s36
-; GFX9-O0-NEXT: v_mov_b32_e32 v33, s37
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT: v_mov_b32_e32 v32, s37
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v32, v32, v0, s[34:35]
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT: v_mov_b32_e32 v32, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v33, v3
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, s36
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v33, v33, v0, s[34:35]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr36
+; GFX9-O0-NEXT: ; implicit-def: $sgpr36
+; GFX9-O0-NEXT: v_mov_b32_e32 v34, v32
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v32
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v33
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v33
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v34
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v11
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v10
@@ -1219,8 +1271,10 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, -1
; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
@@ -1242,32 +1296,32 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O3-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:16
; GFX9-O3-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v1, s[34:35]
; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v2, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v1, s[34:35]
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v32
; GFX9-O3-NEXT: v_mov_b32_e32 v2, v33
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v3, s[34:35]
; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v4, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v3, s[34:35]
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v32
; GFX9-O3-NEXT: v_mov_b32_e32 v4, v33
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v5, s[34:35]
; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v6, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v5, s[34:35]
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: v_mov_b32_e32 v5, v32
; GFX9-O3-NEXT: v_mov_b32_e32 v6, v33
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v7, s[34:35]
; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v8, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v7, s[34:35]
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: v_mov_b32_e32 v7, v32
; GFX9-O3-NEXT: v_mov_b32_e32 v8, v33
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v9, s[34:35]
; GFX9-O3-NEXT: v_cndmask_b32_e64 v33, 0, v10, s[34:35]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v32, 0, v9, s[34:35]
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: v_mov_b32_e32 v9, v32
; GFX9-O3-NEXT: v_mov_b32_e32 v10, v33
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index 7f0db3e..7fecab0 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -26,15 +26,15 @@ define amdgpu_cs void @no_cfg(ptr addrspace(8) inreg %tmp14) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
+; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[6:7]
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -147,11 +147,10 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3
+; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3
; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3]
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -178,6 +177,9 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr0_sgpr1
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1]
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -336,40 +338,40 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
; GFX9-O0-NEXT: v_readlane_b32 s1, v7, 3
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
-; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
-; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c
+; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; GFX9-O0-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c
; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4
-; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-O0-NEXT: s_mov_b32 s3, s9
-; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
-; GFX9-O0-NEXT: s_mov_b32 s9, s17
-; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
+; GFX9-O0-NEXT: s_mov_b32 s3, s7
+; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7
+; GFX9-O0-NEXT: s_mov_b32 s7, s9
+; GFX9-O0-NEXT: s_mov_b32 s16, s8
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
-; GFX9-O0-NEXT: s_mov_b32 s17, s9
-; GFX9-O0-NEXT: s_mov_b32 s18, s8
+; GFX9-O0-NEXT: s_mov_b32 s17, s7
+; GFX9-O0-NEXT: s_mov_b32 s18, s6
; GFX9-O0-NEXT: s_mov_b32 s19, s3
-; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6
-; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7
-; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8
-; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9
-; GFX9-O0-NEXT: s_mov_b32 s3, 0
-; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 10
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_writelane_b32 v1, s16, 4
+; GFX9-O0-NEXT: v_writelane_b32 v1, s17, 5
+; GFX9-O0-NEXT: v_writelane_b32 v1, s18, 6
+; GFX9-O0-NEXT: v_writelane_b32 v1, s19, 7
+; GFX9-O0-NEXT: s_mov_b32 s6, 0
+; GFX9-O0-NEXT: v_writelane_b32 v1, s6, 8
+; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9-O0-NEXT: v_writelane_b32 v1, s2, 9
+; GFX9-O0-NEXT: v_writelane_b32 v1, s3, 10
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[24:27], 0 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3
-; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s6
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[2:3]
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56
; GFX9-O0-NEXT: s_mov_b32 s2, s0
; GFX9-O0-NEXT: s_mov_b32 s0, s1
@@ -398,13 +400,13 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 6
-; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 7
-; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 8
-; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 9
-; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 4
-; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 5
-; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 10
+; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 4
+; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 5
+; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 6
+; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 7
+; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 9
+; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 10
+; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 8
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
@@ -434,15 +436,13 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13]
; GFX9-O3-NEXT: s_load_dword s4, s[2:3], 0x34
; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O3-NEXT: s_add_u32 s8, s2, 56
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[20:21]
; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0
; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3
; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1]
@@ -587,41 +587,56 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
; GFX9-O0-NEXT: v_readlane_b32 s1, v12, 3
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
-; GFX9-O0-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c
+; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c
; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_writelane_b32 v0, s8, 4
-; GFX9-O0-NEXT: v_writelane_b32 v0, s9, 5
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-O0-NEXT: s_mov_b32 s6, s19
-; GFX9-O0-NEXT: s_mov_b32 s7, s18
-; GFX9-O0-NEXT: s_mov_b32 s15, s17
+; GFX9-O0-NEXT: s_mov_b32 s6, s9
+; GFX9-O0-NEXT: s_mov_b32 s7, s8
+; GFX9-O0-NEXT: s_mov_b32 s8, s17
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
-; GFX9-O0-NEXT: s_mov_b32 s17, s15
+; GFX9-O0-NEXT: s_mov_b32 s17, s8
; GFX9-O0-NEXT: s_mov_b32 s18, s7
; GFX9-O0-NEXT: s_mov_b32 s19, s6
-; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6
-; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7
-; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8
-; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_writelane_b32 v1, s16, 4
+; GFX9-O0-NEXT: v_writelane_b32 v1, s17, 5
+; GFX9-O0-NEXT: v_writelane_b32 v1, s18, 6
+; GFX9-O0-NEXT: v_writelane_b32 v1, s19, 7
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7
+; GFX9-O0-NEXT: s_mov_b32 s15, s7
+; GFX9-O0-NEXT: s_mov_b32 s8, s3
+; GFX9-O0-NEXT: ; implicit-def: $sgpr16_sgpr17
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s15
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[8:9]
; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9]
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7
+; GFX9-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 killed $sgpr2_sgpr3
+; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9-O0-NEXT: v_writelane_b32 v1, s2, 8
+; GFX9-O0-NEXT: v_writelane_b32 v1, s3, 9
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[24:27], 0 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, s6
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[2:3]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr2
+; GFX9-O0-NEXT: ; implicit-def: $sgpr2
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6
+; GFX9-O0-NEXT: s_mov_b32 s2, 32
+; GFX9-O0-NEXT: v_lshrrev_b64 v[10:11], s2, v[8:9]
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60
; GFX9-O0-NEXT: s_mov_b32 s2, s0
; GFX9-O0-NEXT: s_mov_b32 s0, s1
@@ -631,11 +646,6 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
; GFX9-O0-NEXT: s_addc_u32 s0, s0, s1
; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
; GFX9-O0-NEXT: s_mov_b32 s9, s0
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
-; GFX9-O0-NEXT: s_mov_b32 s0, 32
-; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3
-; GFX9-O0-NEXT: v_lshrrev_b64 v[10:11], s0, v[8:9]
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10
; GFX9-O0-NEXT: s_getpc_b64 s[0:1]
; GFX9-O0-NEXT: s_add_u32 s0, s0, called_i64@gotpcrel32@lo+4
; GFX9-O0-NEXT: s_addc_u32 s1, s1, called_i64@gotpcrel32@hi+12
@@ -650,20 +660,20 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX9-O0-NEXT: ; implicit-def: $sgpr15
; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 6
-; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 7
-; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 8
-; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 9
-; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 4
-; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 5
+; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 4
+; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 5
+; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 6
+; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 7
+; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 8
+; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 9
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
@@ -701,14 +711,13 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13]
; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4
-; GFX9-O3-NEXT: v_mov_b32_e32 v7, s5
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[8:9]
+; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9]
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O3-NEXT: s_add_u32 s8, s2, 60
; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0
; GFX9-O3-NEXT: s_getpc_b64 s[2:3]
@@ -717,6 +726,7 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
; GFX9-O3-NEXT: s_load_dwordx2 s[22:23], s[2:3], 0x0
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[20:21]
; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3
; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25]
@@ -724,14 +734,14 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
; GFX9-O3-NEXT: s_mov_b32 s13, s7
; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3
; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27]
-; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6
-; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, v7
+; GFX9-O3-NEXT: v_mov_b32_e32 v1, v6
; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23]
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-O3-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6
-; GFX9-O3-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v7, vcc
+; GFX9-O3-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7
+; GFX9-O3-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O3-NEXT: v_mov_b32_e32 v0, v3
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v4
@@ -763,109 +773,103 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
; GFX9-O0-NEXT: s_mov_b32 s4, 5
; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s4, v0
; GFX9-O0-NEXT: s_mov_b32 s4, 0
-; GFX9-O0-NEXT: buffer_load_dwordx4 v[10:13], v0, s[0:3], s4 offen
-; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], v0, s[0:3], s4 offen offset:16
+; GFX9-O0-NEXT: buffer_load_dwordx4 v[11:14], v0, s[0:3], s4 offen
+; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12
; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT: s_mov_b32 s5, 0x7fffffff
-; GFX9-O0-NEXT: s_mov_b32 s8, -1
-; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
-; GFX9-O0-NEXT: s_mov_b32 s9, s5
+; GFX9-O0-NEXT: s_mov_b32 s10, -1
+; GFX9-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11
+; GFX9-O0-NEXT: s_mov_b32 s11, s5
+; GFX9-O0-NEXT: s_mov_b32 s8, s11
+; GFX9-O0-NEXT: ; implicit-def: $sgpr12_sgpr13
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
-; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7]
+; GFX9-O0-NEXT: s_mov_b32 s5, s10
+; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[6:7]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr9
+; GFX9-O0-NEXT: ; implicit-def: $sgpr9
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14
; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
-; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[6:7]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr9
+; GFX9-O0-NEXT: ; implicit-def: $sgpr9
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
-; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7]
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[6:7]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr5
+; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9
-; GFX9-O0-NEXT: buffer_store_dwordx4 v[5:8], v0, s[0:3], s4 offen
-; GFX9-O0-NEXT: buffer_store_dwordx2 v[3:4], v0, s[0:3], s4 offen offset:16
+; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10
+; GFX9-O0-NEXT: buffer_store_dwordx4 v[6:9], v0, s[0:3], s4 offen
+; GFX9-O0-NEXT: buffer_store_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16
; GFX9-O0-NEXT: s_endpgm
;
; GFX9-O3-LABEL: _amdgpu_cs_main:
; GFX9-O3: ; %bb.0:
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[0:3], 0 offen
-; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16
+; GFX9-O3-NEXT: buffer_load_dwordx4 v[8:11], v0, s[0:3], 0 offen
+; GFX9-O3-NEXT: buffer_load_dwordx2 v[12:13], v0, s[0:3], 0 offen offset:16
; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX9-O3-NEXT: s_mov_b32 s6, -1
-; GFX9-O3-NEXT: s_brev_b32 s7, -2
-; GFX9-O3-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-O3-NEXT: v_bfrev_b32_e32 v1, -2
; GFX9-O3-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, s6
-; GFX9-O3-NEXT: v_mov_b32_e32 v4, s7
-; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9
-; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
-; GFX9-O3-NEXT: v_mov_b32_e32 v5, s6
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, s7
-; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, v1, v9, s[4:5]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[4:5]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v5, v1, v11, s[4:5]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v4, -1, v10, s[4:5]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v7, v1, v13, s[4:5]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, -1, v12, s[4:5]
; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1
-; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3
; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2
; GFX9-O3-NEXT: v_mov_b32_e32 v10, v4
+; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3
; GFX9-O3-NEXT: v_mov_b32_e32 v11, v5
; GFX9-O3-NEXT: v_mov_b32_e32 v12, v6
-; GFX9-O3-NEXT: buffer_store_dwordx4 v[7:10], v0, s[0:3], 0 offen
-; GFX9-O3-NEXT: buffer_store_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16
+; GFX9-O3-NEXT: v_mov_b32_e32 v13, v7
+; GFX9-O3-NEXT: buffer_store_dwordx4 v[8:11], v0, s[0:3], 0 offen
+; GFX9-O3-NEXT: buffer_store_dwordx2 v[12:13], v0, s[0:3], 0 offen offset:16
; GFX9-O3-NEXT: s_endpgm
%tmp17 = shl i32 %index, 5
%tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %tmp17, i32 0)
@@ -915,15 +919,15 @@ define amdgpu_cs void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v5
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6
+; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[6:7]
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s4
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v2, v0 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -1036,11 +1040,10 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[16:19], 0 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3
+; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3
; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3]
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -1067,6 +1070,9 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-O0-NEXT: s_mov_b64 exec, s[0:1]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr0_sgpr1
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[0:1]
; GFX9-O0-NEXT: s_nop 1
; GFX9-O0-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
@@ -1225,40 +1231,40 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
; GFX9-O0-NEXT: v_readlane_b32 s1, v7, 3
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
-; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
-; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c
+; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; GFX9-O0-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c
; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_writelane_b32 v0, s6, 4
-; GFX9-O0-NEXT: v_writelane_b32 v0, s7, 5
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-O0-NEXT: s_mov_b32 s3, s9
-; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
-; GFX9-O0-NEXT: s_mov_b32 s9, s17
-; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
+; GFX9-O0-NEXT: s_mov_b32 s3, s7
+; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7
+; GFX9-O0-NEXT: s_mov_b32 s7, s9
+; GFX9-O0-NEXT: s_mov_b32 s16, s8
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
-; GFX9-O0-NEXT: s_mov_b32 s17, s9
-; GFX9-O0-NEXT: s_mov_b32 s18, s8
+; GFX9-O0-NEXT: s_mov_b32 s17, s7
+; GFX9-O0-NEXT: s_mov_b32 s18, s6
; GFX9-O0-NEXT: s_mov_b32 s19, s3
-; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6
-; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7
-; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8
-; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9
-; GFX9-O0-NEXT: s_mov_b32 s3, 0
-; GFX9-O0-NEXT: v_writelane_b32 v0, s3, 10
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_writelane_b32 v1, s16, 4
+; GFX9-O0-NEXT: v_writelane_b32 v1, s17, 5
+; GFX9-O0-NEXT: v_writelane_b32 v1, s18, 6
+; GFX9-O0-NEXT: v_writelane_b32 v1, s19, 7
+; GFX9-O0-NEXT: s_mov_b32 s6, 0
+; GFX9-O0-NEXT: v_writelane_b32 v1, s6, 8
+; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9-O0-NEXT: v_writelane_b32 v1, s2, 9
+; GFX9-O0-NEXT: v_writelane_b32 v1, s3, 10
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[24:27], 0 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, s3
-; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, s2
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s6
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[2:3]
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 56
; GFX9-O0-NEXT: s_mov_b32 s2, s0
; GFX9-O0-NEXT: s_mov_b32 s0, s1
@@ -1287,13 +1293,13 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 6
-; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 7
-; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 8
-; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 9
-; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 4
-; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 5
-; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 10
+; GFX9-O0-NEXT: v_readlane_b32 s0, v1, 4
+; GFX9-O0-NEXT: v_readlane_b32 s1, v1, 5
+; GFX9-O0-NEXT: v_readlane_b32 s2, v1, 6
+; GFX9-O0-NEXT: v_readlane_b32 s3, v1, 7
+; GFX9-O0-NEXT: v_readlane_b32 s6, v1, 9
+; GFX9-O0-NEXT: v_readlane_b32 s7, v1, 10
+; GFX9-O0-NEXT: v_readlane_b32 s4, v1, 8
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
@@ -1323,15 +1329,13 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13]
; GFX9-O3-NEXT: s_load_dword s4, s[2:3], 0x34
; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O3-NEXT: s_add_u32 s8, s2, 56
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[20:21]
; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0
; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3
; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1]
@@ -1476,41 +1480,56 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
; GFX9-O0-NEXT: v_readlane_b32 s1, v12, 3
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
-; GFX9-O0-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c
+; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c
; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_writelane_b32 v0, s8, 4
-; GFX9-O0-NEXT: v_writelane_b32 v0, s9, 5
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-O0-NEXT: s_mov_b32 s6, s19
-; GFX9-O0-NEXT: s_mov_b32 s7, s18
-; GFX9-O0-NEXT: s_mov_b32 s15, s17
+; GFX9-O0-NEXT: s_mov_b32 s6, s9
+; GFX9-O0-NEXT: s_mov_b32 s7, s8
+; GFX9-O0-NEXT: s_mov_b32 s8, s17
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
-; GFX9-O0-NEXT: s_mov_b32 s17, s15
+; GFX9-O0-NEXT: s_mov_b32 s17, s8
; GFX9-O0-NEXT: s_mov_b32 s18, s7
; GFX9-O0-NEXT: s_mov_b32 s19, s6
-; GFX9-O0-NEXT: v_writelane_b32 v0, s16, 6
-; GFX9-O0-NEXT: v_writelane_b32 v0, s17, 7
-; GFX9-O0-NEXT: v_writelane_b32 v0, s18, 8
-; GFX9-O0-NEXT: v_writelane_b32 v0, s19, 9
-; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[24:27], 0 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_writelane_b32 v1, s16, 4
+; GFX9-O0-NEXT: v_writelane_b32 v1, s17, 5
+; GFX9-O0-NEXT: v_writelane_b32 v1, s18, 6
+; GFX9-O0-NEXT: v_writelane_b32 v1, s19, 7
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, s7
+; GFX9-O0-NEXT: s_mov_b32 s15, s7
+; GFX9-O0-NEXT: s_mov_b32 s8, s3
+; GFX9-O0-NEXT: ; implicit-def: $sgpr16_sgpr17
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s15
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[8:9]
; GFX9-O0-NEXT: s_mov_b64 exec, s[8:9]
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, s2
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, s3
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: ; kill: def $sgpr6 killed $sgpr6 killed $sgpr6_sgpr7
+; GFX9-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 killed $sgpr2_sgpr3
+; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[2:3], -1
+; GFX9-O0-NEXT: v_writelane_b32 v1, s2, 8
+; GFX9-O0-NEXT: v_writelane_b32 v1, s3, 9
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[24:27], 0 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, s6
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[2:3]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr2
+; GFX9-O0-NEXT: ; implicit-def: $sgpr2
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6
+; GFX9-O0-NEXT: s_mov_b32 s2, 32
+; GFX9-O0-NEXT: v_lshrrev_b64 v[10:11], s2, v[8:9]
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10
; GFX9-O0-NEXT: s_mov_b64 s[6:7], 60
; GFX9-O0-NEXT: s_mov_b32 s2, s0
; GFX9-O0-NEXT: s_mov_b32 s0, s1
@@ -1520,11 +1539,6 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
; GFX9-O0-NEXT: s_addc_u32 s0, s0, s1
; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
; GFX9-O0-NEXT: s_mov_b32 s9, s0
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8
-; GFX9-O0-NEXT: s_mov_b32 s0, 32
-; GFX9-O0-NEXT: ; implicit-def: $sgpr2_sgpr3
-; GFX9-O0-NEXT: v_lshrrev_b64 v[10:11], s0, v[8:9]
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10
; GFX9-O0-NEXT: s_getpc_b64 s[0:1]
; GFX9-O0-NEXT: s_add_u32 s0, s0, strict_wwm_called_i64@gotpcrel32@lo+4
; GFX9-O0-NEXT: s_addc_u32 s1, s1, strict_wwm_called_i64@gotpcrel32@hi+12
@@ -1539,20 +1553,20 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
; GFX9-O0-NEXT: ; implicit-def: $sgpr6_sgpr7
; GFX9-O0-NEXT: ; implicit-def: $sgpr15
; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[24:27], 0 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 6
-; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 7
-; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 8
-; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 9
-; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 4
-; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 5
+; GFX9-O0-NEXT: v_readlane_b32 s0, v2, 4
+; GFX9-O0-NEXT: v_readlane_b32 s1, v2, 5
+; GFX9-O0-NEXT: v_readlane_b32 s2, v2, 6
+; GFX9-O0-NEXT: v_readlane_b32 s3, v2, 7
+; GFX9-O0-NEXT: v_readlane_b32 s4, v2, 8
+; GFX9-O0-NEXT: v_readlane_b32 s5, v2, 9
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 ; 4-byte Folded Reload
@@ -1590,14 +1604,13 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
; GFX9-O3-NEXT: s_mov_b64 exec, s[12:13]
; GFX9-O3-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
; GFX9-O3-NEXT: s_load_dwordx4 s[16:19], s[2:3], 0x24
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, s4
-; GFX9-O3-NEXT: v_mov_b32_e32 v7, s5
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[8:9], -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, 0, v0, s[8:9]
+; GFX9-O3-NEXT: s_mov_b64 exec, s[8:9]
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O3-NEXT: s_add_u32 s8, s2, 60
; GFX9-O3-NEXT: s_addc_u32 s9, s3, 0
; GFX9-O3-NEXT: s_getpc_b64 s[2:3]
@@ -1606,6 +1619,7 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
; GFX9-O3-NEXT: s_load_dwordx2 s[22:23], s[2:3], 0x0
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v3, 20, v3
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 10, v4
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v7, 0, v0, s[20:21]
; GFX9-O3-NEXT: v_or3_b32 v3, v5, v4, v3
; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1]
; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25]
@@ -1613,14 +1627,14 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
; GFX9-O3-NEXT: s_mov_b32 s13, s7
; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3
; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27]
-; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6
-; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, v7
+; GFX9-O3-NEXT: v_mov_b32_e32 v1, v6
; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23]
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v0
; GFX9-O3-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-O3-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6
-; GFX9-O3-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v7, vcc
+; GFX9-O3-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7
+; GFX9-O3-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
; GFX9-O3-NEXT: s_mov_b64 exec, s[20:21]
; GFX9-O3-NEXT: v_mov_b32_e32 v0, v3
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v4
@@ -1652,109 +1666,103 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind
; GFX9-O0-NEXT: s_mov_b32 s4, 5
; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s4, v0
; GFX9-O0-NEXT: s_mov_b32 s4, 0
-; GFX9-O0-NEXT: buffer_load_dwordx4 v[10:13], v0, s[0:3], s4 offen
-; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], v0, s[0:3], s4 offen offset:16
+; GFX9-O0-NEXT: buffer_load_dwordx4 v[11:14], v0, s[0:3], s4 offen
+; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12
; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-O0-NEXT: s_mov_b32 s5, 0x7fffffff
-; GFX9-O0-NEXT: s_mov_b32 s8, -1
-; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9
-; GFX9-O0-NEXT: s_mov_b32 s9, s5
+; GFX9-O0-NEXT: s_mov_b32 s10, -1
+; GFX9-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 def $sgpr10_sgpr11
+; GFX9-O0-NEXT: s_mov_b32 s11, s5
+; GFX9-O0-NEXT: s_mov_b32 s8, s11
+; GFX9-O0-NEXT: ; implicit-def: $sgpr12_sgpr13
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
-; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7]
+; GFX9-O0-NEXT: s_mov_b32 s5, s10
+; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[6:7]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr9
+; GFX9-O0-NEXT: ; implicit-def: $sgpr9
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14
; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
-; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v6
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[6:7]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr9
+; GFX9-O0-NEXT: ; implicit-def: $sgpr9
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9
-; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7]
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-O0-NEXT: ; implicit-def: $sgpr8_sgpr9
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[6:7]
+; GFX9-O0-NEXT: ; implicit-def: $sgpr5
+; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v2
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v9
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
; GFX9-O0-NEXT: ; implicit-def: $sgpr5
-; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9
-; GFX9-O0-NEXT: buffer_store_dwordx4 v[5:8], v0, s[0:3], s4 offen
-; GFX9-O0-NEXT: buffer_store_dwordx2 v[3:4], v0, s[0:3], s4 offen offset:16
+; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10
+; GFX9-O0-NEXT: buffer_store_dwordx4 v[6:9], v0, s[0:3], s4 offen
+; GFX9-O0-NEXT: buffer_store_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16
; GFX9-O0-NEXT: s_endpgm
;
; GFX9-O3-LABEL: strict_wwm_amdgpu_cs_main:
; GFX9-O3: ; %bb.0:
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[0:3], 0 offen
-; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16
+; GFX9-O3-NEXT: buffer_load_dwordx4 v[8:11], v0, s[0:3], 0 offen
+; GFX9-O3-NEXT: buffer_load_dwordx2 v[12:13], v0, s[0:3], 0 offen offset:16
; GFX9-O3-NEXT: s_or_saveexec_b64 s[4:5], -1
-; GFX9-O3-NEXT: s_mov_b32 s6, -1
-; GFX9-O3-NEXT: s_brev_b32 s7, -2
-; GFX9-O3-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-O3-NEXT: v_bfrev_b32_e32 v1, -2
; GFX9-O3-NEXT: s_waitcnt vmcnt(1)
-; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, v8
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, s6
-; GFX9-O3-NEXT: v_mov_b32_e32 v4, s7
-; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, v9
-; GFX9-O3-NEXT: v_mov_b32_e32 v4, v10
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
-; GFX9-O3-NEXT: v_mov_b32_e32 v5, s6
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, s7
-; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v3, v1, v9, s[4:5]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, -1, v8, s[4:5]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v5, v1, v11, s[4:5]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v4, -1, v10, s[4:5]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: v_mov_b32_e32 v5, v11
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, v12
-; GFX9-O3-NEXT: s_mov_b64 exec, -1
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v7, v1, v13, s[4:5]
+; GFX9-O3-NEXT: v_cndmask_b32_e64 v6, -1, v12, s[4:5]
; GFX9-O3-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1
-; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3
; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2
; GFX9-O3-NEXT: v_mov_b32_e32 v10, v4
+; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3
; GFX9-O3-NEXT: v_mov_b32_e32 v11, v5
; GFX9-O3-NEXT: v_mov_b32_e32 v12, v6
-; GFX9-O3-NEXT: buffer_store_dwordx4 v[7:10], v0, s[0:3], 0 offen
-; GFX9-O3-NEXT: buffer_store_dwordx2 v[11:12], v0, s[0:3], 0 offen offset:16
+; GFX9-O3-NEXT: v_mov_b32_e32 v13, v7
+; GFX9-O3-NEXT: buffer_store_dwordx4 v[8:11], v0, s[0:3], 0 offen
+; GFX9-O3-NEXT: buffer_store_dwordx2 v[12:13], v0, s[0:3], 0 offen offset:16
; GFX9-O3-NEXT: s_endpgm
%tmp17 = shl i32 %index, 5
%tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %tmp17, i32 0)
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index f6db9c4..176dfee 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -1,18 +1,30 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes OPT
-; RUN: llc < %s -mcpu=sm_70 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes PTX
+; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes OPT
+; RUN: llc < %s --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes PTX
define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %out, i32 %n) {
; PTX-LABEL: grid_const_int(
-; PTX-NOT: ld.u32
-; PTX: ld.param.{{.*}} [[R2:%.*]], [grid_const_int_param_0];
-;
+; PTX: {
+; PTX-NEXT: .reg .b32 %r<4>;
+; PTX-NEXT: .reg .b64 %rd<3>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: ld.param.u64 %rd1, [grid_const_int_param_2];
+; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
+; PTX-NEXT: ld.param.u32 %r1, [grid_const_int_param_1];
+; PTX-NEXT: ld.param.u32 %r2, [grid_const_int_param_0];
+; PTX-NEXT: add.s32 %r3, %r2, %r1;
+; PTX-NEXT: st.global.u32 [%rd2], %r3;
+; PTX-NEXT: ret;
; OPT-LABEL: define void @grid_const_int(
-; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) {
-; OPT-NOT: alloca
-; OPT: [[INPUT11:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; OPT: [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4
-;
+; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-NEXT: [[OUT2:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; OPT-NEXT: [[OUT3:%.*]] = addrspacecast ptr addrspace(1) [[OUT2]] to ptr
+; OPT-NEXT: [[INPUT11:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; OPT-NEXT: [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4
+; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[INPUT2]]
+; OPT-NEXT: store i32 [[ADD]], ptr [[OUT3]], align 4
+; OPT-NEXT: ret void
%tmp = load i32, ptr %input1, align 4
%add = add i32 %tmp, %input2
store i32 %add, ptr %out
@@ -24,19 +36,29 @@ define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %ou
define void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
; PTX-LABEL: grid_const_struct(
; PTX: {
-; PTX-NOT: ld.u32
-; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_struct_param_0];
-; PTX: ld.param.{{.*}} [[R2:%.*]], [grid_const_struct_param_0+4];
-;
+; PTX-NEXT: .reg .b32 %r<4>;
+; PTX-NEXT: .reg .b64 %rd<3>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: ld.param.u64 %rd1, [grid_const_struct_param_1];
+; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
+; PTX-NEXT: ld.param.u32 %r1, [grid_const_struct_param_0];
+; PTX-NEXT: ld.param.u32 %r2, [grid_const_struct_param_0+4];
+; PTX-NEXT: add.s32 %r3, %r1, %r2;
+; PTX-NEXT: st.global.u32 [%rd2], %r3;
+; PTX-NEXT: ret;
; OPT-LABEL: define void @grid_const_struct(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) {
-; OPT-NOT: alloca
-; OPT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT: [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0
-; OPT: [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1
-; OPT: [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4
-; OPT: [[TMP2:%.*]] = load i32, ptr addrspace(101) [[GEP22]], align 4
-;
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[OUT4:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; OPT-NEXT: [[OUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUT4]] to ptr
+; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0
+; OPT-NEXT: [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1
+; OPT-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4
+; OPT-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(101) [[GEP22]], align 4
+; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP1]], [[TMP2]]
+; OPT-NEXT: store i32 [[ADD]], ptr [[OUT5]], align 4
+; OPT-NEXT: ret void
%gep1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
%gep2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
%int1 = load i32, ptr %gep1
@@ -49,41 +71,85 @@ define void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
define void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
; PTX-LABEL: grid_const_escape(
; PTX: {
-; PTX-NOT: .local
-; PTX: cvta.param.{{.*}}
+; PTX-NEXT: .reg .b32 %r<3>;
+; PTX-NEXT: .reg .b64 %rd<4>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd1, grid_const_escape_param_0;
+; PTX-NEXT: mov.u64 %rd2, %rd1;
+; PTX-NEXT: cvta.param.u64 %rd3, %rd2;
+; PTX-NEXT: { // callseq 0, 0
+; PTX-NEXT: .param .b64 param0;
+; PTX-NEXT: st.param.b64 [param0+0], %rd3;
+; PTX-NEXT: .param .b32 retval0;
+; PTX-NEXT: call.uni (retval0),
+; PTX-NEXT: escape,
+; PTX-NEXT: (
+; PTX-NEXT: param0
+; PTX-NEXT: );
+; PTX-NEXT: ld.param.b32 %r1, [retval0+0];
+; PTX-NEXT: } // callseq 0
+; PTX-NEXT: ret;
; OPT-LABEL: define void @grid_const_escape(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) {
-; OPT-NOT: alloca [[STRUCT_S]]
-; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
-; OPT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]])
-;
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]])
+; OPT-NEXT: ret void
%call = call i32 @escape(ptr %input)
ret void
}
define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32 %a, ptr byval(i32) align 4 %b) {
; PTX-LABEL: multiple_grid_const_escape(
-; PTX: mov.{{.*}} [[RD1:%.*]], multiple_grid_const_escape_param_0;
-; PTX: mov.{{.*}} [[RD2:%.*]], multiple_grid_const_escape_param_2;
-; PTX: mov.{{.*}} [[RD3:%.*]], [[RD2]];
-; PTX: mov.{{.*}} [[RD4:%.*]], [[RD1]];
-; PTX: cvta.param.{{.*}} [[RD5:%.*]], [[RD4]];
-; PTX: cvta.param.{{.*}} [[RD6:%.*]], [[RD3]];
-; PTX: {
-; PTX: st.param.b64 [param0+0], [[RD5]];
-; PTX: st.param.b64 [param2+0], [[RD6]];
-;
+; PTX: {
+; PTX-NEXT: .local .align 4 .b8 __local_depot3[4];
+; PTX-NEXT: .reg .b64 %SP;
+; PTX-NEXT: .reg .b64 %SPL;
+; PTX-NEXT: .reg .b32 %r<4>;
+; PTX-NEXT: .reg .b64 %rd<9>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.u64 %SPL, __local_depot3;
+; PTX-NEXT: cvta.local.u64 %SP, %SPL;
+; PTX-NEXT: mov.b64 %rd1, multiple_grid_const_escape_param_0;
+; PTX-NEXT: mov.b64 %rd2, multiple_grid_const_escape_param_2;
+; PTX-NEXT: mov.u64 %rd3, %rd2;
+; PTX-NEXT: ld.param.u32 %r1, [multiple_grid_const_escape_param_1];
+; PTX-NEXT: cvta.param.u64 %rd4, %rd3;
+; PTX-NEXT: mov.u64 %rd5, %rd1;
+; PTX-NEXT: cvta.param.u64 %rd6, %rd5;
+; PTX-NEXT: add.u64 %rd7, %SP, 0;
+; PTX-NEXT: add.u64 %rd8, %SPL, 0;
+; PTX-NEXT: st.local.u32 [%rd8], %r1;
+; PTX-NEXT: { // callseq 1, 0
+; PTX-NEXT: .param .b64 param0;
+; PTX-NEXT: st.param.b64 [param0+0], %rd6;
+; PTX-NEXT: .param .b64 param1;
+; PTX-NEXT: st.param.b64 [param1+0], %rd7;
+; PTX-NEXT: .param .b64 param2;
+; PTX-NEXT: st.param.b64 [param2+0], %rd4;
+; PTX-NEXT: .param .b32 retval0;
+; PTX-NEXT: call.uni (retval0),
+; PTX-NEXT: escape3,
+; PTX-NEXT: (
+; PTX-NEXT: param0,
+; PTX-NEXT: param1,
+; PTX-NEXT: param2
+; PTX-NEXT: );
+; PTX-NEXT: ld.param.b32 %r2, [retval0+0];
+; PTX-NEXT: } // callseq 1
+; PTX-NEXT: ret;
; OPT-LABEL: define void @multiple_grid_const_escape(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) {
-; OPT: [[B_PARAM:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
-; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT-NOT: alloca %struct.s
-; OPT: [[A_ADDR:%.*]] = alloca i32, align 4
-; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
-; OPT: [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]])
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[B_PARAM:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
+; OPT-NEXT: [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]])
+; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+; OPT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
; OPT-NEXT: [[CALL:%.*]] = call i32 @escape3(ptr [[INPUT_PARAM_GEN]], ptr [[A_ADDR]], ptr [[B_PARAM_GEN]])
-;
+; OPT-NEXT: ret void
%a.addr = alloca i32, align 4
store i32 %a, ptr %a.addr, align 4
%call = call i32 @escape3(ptr %input, ptr %a.addr, ptr %b)
@@ -92,40 +158,58 @@ define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32
define void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %addr) {
; PTX-LABEL: grid_const_memory_escape(
-; PTX-NOT: .local
-; PTX: mov.b64 [[RD1:%.*]], grid_const_memory_escape_param_0;
-; PTX: cvta.param.u64 [[RD3:%.*]], [[RD2:%.*]];
-; PTX: st.global.u64 [[[RD4:%.*]]], [[RD3]];
-;
+; PTX: {
+; PTX-NEXT: .reg .b64 %rd<6>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd1, grid_const_memory_escape_param_0;
+; PTX-NEXT: ld.param.u64 %rd2, [grid_const_memory_escape_param_1];
+; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2;
+; PTX-NEXT: mov.u64 %rd4, %rd1;
+; PTX-NEXT: cvta.param.u64 %rd5, %rd4;
+; PTX-NEXT: st.global.u64 [%rd3], %rd5;
+; PTX-NEXT: ret;
; OPT-LABEL: define void @grid_const_memory_escape(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) {
-; OPT-NOT: alloca [[STRUCT_S]]
-; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
-; OPT: store ptr [[INPUT_PARAM_GEN]], ptr {{.*}}, align 8
-;
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[ADDR4:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(1)
+; OPT-NEXT: [[ADDR5:%.*]] = addrspacecast ptr addrspace(1) [[ADDR4]] to ptr
+; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT-NEXT: store ptr [[INPUT1]], ptr [[ADDR5]], align 8
+; OPT-NEXT: ret void
store ptr %input, ptr %addr, align 8
ret void
}
define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, ptr %result) {
; PTX-LABEL: grid_const_inlineasm_escape(
-; PTX-NOT .local
-; PTX: add.{{.*}} [[RD2:%.*]], [[RD1:%.*]], 4;
-; PTX: cvta.param.u64 [[RD4:%.*]], [[RD2]]
-; PTX: cvta.param.u64 [[RD3:%.*]], [[RD1]]
-; PTX: add.s64 [[RD5:%.*]], [[RD3]], [[RD4]];
-;
+; PTX: {
+; PTX-NEXT: .reg .b64 %rd<8>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd4, grid_const_inlineasm_escape_param_0;
+; PTX-NEXT: ld.param.u64 %rd5, [grid_const_inlineasm_escape_param_1];
+; PTX-NEXT: cvta.to.global.u64 %rd6, %rd5;
+; PTX-NEXT: mov.u64 %rd7, %rd4;
+; PTX-NEXT: cvta.param.u64 %rd2, %rd7;
+; PTX-NEXT: add.s64 %rd3, %rd2, 4;
+; PTX-NEXT: // begin inline asm
+; PTX-NEXT: add.s64 %rd1, %rd2, %rd3;
+; PTX-NEXT: // end inline asm
+; PTX-NEXT: st.global.u64 [%rd6], %rd1;
+; PTX-NEXT: ret;
+; PTX-NOT: .local
; OPT-LABEL: define void @grid_const_inlineasm_escape(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) {
-; OPT-NOT: alloca [[STRUCT_S]]
-; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT: [[TMPPTR13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT_PARAM]], i32 0, i32 0
-; OPT: [[TMPPTR22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT_PARAM]], i32 0, i32 1
-; OPT: [[TMPPTR22_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[TMPPTR22]])
-; OPT: [[TMPPTR13_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[TMPPTR13]])
-; OPT: [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2
-;
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[RESULT4:%.*]] = addrspacecast ptr [[RESULT]] to ptr addrspace(1)
+; OPT-NEXT: [[RESULT5:%.*]] = addrspacecast ptr addrspace(1) [[RESULT4]] to ptr
+; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT-NEXT: [[TMPPTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0
+; OPT-NEXT: [[TMPPTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1
+; OPT-NEXT: [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2
+; OPT-NEXT: store i64 [[TMP2]], ptr [[RESULT5]], align 8
+; OPT-NEXT: ret void
%tmpptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
%tmpptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
%1 = call i64 asm "add.s64 $0, $1, $2;", "=l,l,l"(ptr %tmpptr1, ptr %tmpptr2) #1
@@ -135,24 +219,42 @@ define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, pt
define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
; PTX-LABEL: grid_const_partial_escape(
-; PTX-NOT: .local
-; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_partial_escape_param_0];
-; PTX: add.{{.*}}
-; PTX: cvta.param.u64 [[RD3:%.*]], {{%.*}}
-; PTX: st.param.{{.*}} [param0+0], [[RD3]]
-; PTX: call
-;
+; PTX: {
+; PTX-NEXT: .reg .b32 %r<5>;
+; PTX-NEXT: .reg .b64 %rd<6>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd1, grid_const_partial_escape_param_0;
+; PTX-NEXT: ld.param.u64 %rd2, [grid_const_partial_escape_param_1];
+; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2;
+; PTX-NEXT: mov.u64 %rd4, %rd1;
+; PTX-NEXT: cvta.param.u64 %rd5, %rd4;
+; PTX-NEXT: ld.u32 %r1, [%rd5];
+; PTX-NEXT: add.s32 %r2, %r1, %r1;
+; PTX-NEXT: st.global.u32 [%rd3], %r2;
+; PTX-NEXT: { // callseq 2, 0
+; PTX-NEXT: .param .b64 param0;
+; PTX-NEXT: st.param.b64 [param0+0], %rd5;
+; PTX-NEXT: .param .b32 retval0;
+; PTX-NEXT: call.uni (retval0),
+; PTX-NEXT: escape,
+; PTX-NEXT: (
+; PTX-NEXT: param0
+; PTX-NEXT: );
+; PTX-NEXT: ld.param.b32 %r3, [retval0+0];
+; PTX-NEXT: } // callseq 2
+; PTX-NEXT: ret;
; OPT-LABEL: define void @grid_const_partial_escape(
-; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]], ptr {{%.*}}) {
-; OPT-NOT: alloca
-; OPT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT: [[VAL:%.*]] = load i32, ptr addrspace(101) [[INPUT1]], align 4
-; OPT: [[TWICE:%.*]] = add i32 [[VAL]], [[VAL]]
-; OPT: store i32 [[TWICE]]
-; OPT: [[INPUT1_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1]])
-; OPT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT1_GEN]])
-; OPT: ret void
-;
+; OPT-SAME: ptr byval(i32) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1)
+; OPT-NEXT: [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr
+; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT1_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1]])
+; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[INPUT1_GEN]], align 4
+; OPT-NEXT: [[TWICE:%.*]] = add i32 [[VAL1]], [[VAL1]]
+; OPT-NEXT: store i32 [[TWICE]], ptr [[OUTPUT5]], align 4
+; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT1_GEN]])
+; OPT-NEXT: ret void
%val = load i32, ptr %input
%twice = add i32 %val, %val
store i32 %twice, ptr %output
@@ -163,27 +265,46 @@ define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %output) {
; PTX-LABEL: grid_const_partial_escapemem(
; PTX: {
-; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_partial_escapemem_param_0];
-; PTX: ld.param.{{.*}} [[R2:%.*]], [grid_const_partial_escapemem_param_0+4];
-; PTX: cvta.param.{{.*}} [[RD5:%.*]], {{%.*}};
-; PTX: st.global.{{.*}} [{{.*}}], [[RD5]];
-; PTX: add.s32 [[R3:%.*]], [[R1]], [[R2]]
-; PTX: st.param.{{.*}} [param0+0], [[RD5]]
-; PTX: escape
+; PTX-NEXT: .reg .b32 %r<6>;
+; PTX-NEXT: .reg .b64 %rd<6>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd1, grid_const_partial_escapemem_param_0;
+; PTX-NEXT: ld.param.u64 %rd2, [grid_const_partial_escapemem_param_1];
+; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2;
+; PTX-NEXT: mov.u64 %rd4, %rd1;
+; PTX-NEXT: cvta.param.u64 %rd5, %rd4;
+; PTX-NEXT: ld.u32 %r1, [%rd5];
+; PTX-NEXT: ld.u32 %r2, [%rd5+4];
+; PTX-NEXT: st.global.u64 [%rd3], %rd5;
+; PTX-NEXT: add.s32 %r3, %r1, %r2;
+; PTX-NEXT: { // callseq 3, 0
+; PTX-NEXT: .param .b64 param0;
+; PTX-NEXT: st.param.b64 [param0+0], %rd5;
+; PTX-NEXT: .param .b32 retval0;
+; PTX-NEXT: call.uni (retval0),
+; PTX-NEXT: escape,
+; PTX-NEXT: (
+; PTX-NEXT: param0
+; PTX-NEXT: );
+; PTX-NEXT: ld.param.b32 %r4, [retval0+0];
+; PTX-NEXT: } // callseq 3
+; PTX-NEXT: st.param.b32 [func_retval0+0], %r3;
+; PTX-NEXT: ret;
; OPT-LABEL: define i32 @grid_const_partial_escapemem(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr {{%.*}}) {
-; OPT-NOT: alloca
-; OPT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT: [[PTR13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT2]], i32 0, i32 0
-; OPT: [[VAL1:%.*]] = load i32, ptr addrspace(101) [[PTR13]], align 4
-; OPT: [[PTR22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT2]], i32 0, i32 1
-; OPT: [[VAL2:%.*]] = load i32, ptr addrspace(101) [[PTR22]], align 4
-; OPT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]])
-; OPT: store ptr [[INPUT1]]
-; OPT: [[ADD:%.*]] = add i32 [[VAL1]], [[VAL2]]
-; OPT: [[PTR1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[PTR13]])
-; OPT: [[CALL2:%.*]] = call i32 @escape(ptr [[PTR1]])
-;
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1)
+; OPT-NEXT: [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr
+; OPT-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]])
+; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0
+; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[PTR1]], align 4
+; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1
+; OPT-NEXT: [[VAL2:%.*]] = load i32, ptr [[PTR2]], align 4
+; OPT-NEXT: store ptr [[INPUT1]], ptr [[OUTPUT5]], align 8
+; OPT-NEXT: [[ADD:%.*]] = add i32 [[VAL1]], [[VAL2]]
+; OPT-NEXT: [[CALL2:%.*]] = call i32 @escape(ptr [[PTR1]])
+; OPT-NEXT: ret i32 [[ADD]]
%ptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
%val1 = load i32, ptr %ptr1
%ptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
@@ -194,29 +315,48 @@ define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %outpu
ret i32 %add
}
-define void @grid_const_phi_escape(ptr byval(%struct.s) align 4 %input1, ptr %inout) {
-; PTX-LABEL: grid_const_phi_escape(
-; PTX: cvta.param.{{.*}} [[RD1:%.*]], {{.*}}
-; PTX: @[[P1:%.*]] bra $L__BB[[TARGET_LABEL:[_0-9]+]];
-; PTX: $L__BB[[TARGET_LABEL]]:
-; PTX: ld.{{.*}} [[R1:%.*]], [[[RD1]]];
-;
-; OPT-LABEL: define void @grid_const_phi_escape(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr {{%.*}}) {
-; OPT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; OPT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
-; OPT: br i1 {{.*}}, label %[[FIRST:.*]], label %[[SECOND:.*]]
+define void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr %inout) {
+; PTX-LABEL: grid_const_phi(
+; PTX: {
+; PTX-NEXT: .reg .pred %p<2>;
+; PTX-NEXT: .reg .b32 %r<3>;
+; PTX-NEXT: .reg .b64 %rd<9>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd5, grid_const_phi_param_0;
+; PTX-NEXT: ld.param.u64 %rd6, [grid_const_phi_param_1];
+; PTX-NEXT: cvta.to.global.u64 %rd1, %rd6;
+; PTX-NEXT: mov.u64 %rd7, %rd5;
+; PTX-NEXT: cvta.param.u64 %rd8, %rd7;
+; PTX-NEXT: ld.global.u32 %r1, [%rd1];
+; PTX-NEXT: setp.lt.s32 %p1, %r1, 0;
+; PTX-NEXT: @%p1 bra $L__BB8_2;
+; PTX-NEXT: // %bb.1: // %second
+; PTX-NEXT: add.s64 %rd8, %rd8, 4;
+; PTX-NEXT: $L__BB8_2: // %merge
+; PTX-NEXT: ld.u32 %r2, [%rd8];
+; PTX-NEXT: st.global.u32 [%rd1], %r2;
+; PTX-NEXT: ret;
+; OPT-LABEL: define void @grid_const_phi(
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
+; OPT-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
+; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
+; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4
+; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
+; OPT-NEXT: br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; OPT: [[FIRST]]:
-; OPT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0
-; OPT: br label %[[MERGE:.*]]
+; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0
+; OPT-NEXT: br label %[[MERGE:.*]]
; OPT: [[SECOND]]:
-; OPT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 1
-; OPT: br label %[[MERGE]]
+; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 1
+; OPT-NEXT: br label %[[MERGE]]
; OPT: [[MERGE]]:
-; OPT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
-; OPT-NOT: load i32, ptr addrspace(101)
-; OPT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
-;
+; OPT-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
+; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
+; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
+; OPT-NEXT: ret void
%val = load i32, ptr %inout
%less = icmp slt i32 %val, 0
@@ -235,32 +375,53 @@ merge:
}
; NOTE: %input2 is *not* grid_constant
-define void @grid_const_phi_escape2(ptr byval(%struct.s) align 4 %input1, ptr byval(%struct.s) %input2, ptr %inout) {
-; PTX-LABEL: grid_const_phi_escape2(
-; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_phi_escape2_param_1+4];
-; PTX: @[[P1:%.*]] bra $L__BB[[LABEL:[_0-9]+]];
-; PTX: cvta.param.u64 [[RD1:%.*]], [[RD2:%.*]];
-; PTX: ld.u32 [[R1]], [[[RD1]]];
-; PTX: $L__BB[[LABEL]]:
-; PTX: st.global.u32 [[[RD3:%.*]]], [[R1]]
-; OPT-LABEL: define void @grid_const_phi_escape2(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr {{%.*}}) {
-; OPT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8
-; OPT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
-; OPT: [[INPUT26:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT25]], align 8
-; OPT: store [[STRUCT_S]] [[INPUT26]], ptr [[INPUT24]], align 4
-; OPT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; OPT: [[INPUT11:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT12]])
-; OPT: br i1 [[LESS:%.*]], label %[[FIRST:.*]], label %[[SECOND:.*]]
+define void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ptr byval(%struct.s) %input2, ptr %inout) {
+; PTX-LABEL: grid_const_phi_ngc(
+; PTX: {
+; PTX-NEXT: .reg .pred %p<2>;
+; PTX-NEXT: .reg .b32 %r<3>;
+; PTX-NEXT: .reg .b64 %rd<12>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd6, grid_const_phi_ngc_param_0;
+; PTX-NEXT: ld.param.u64 %rd7, [grid_const_phi_ngc_param_2];
+; PTX-NEXT: cvta.to.global.u64 %rd1, %rd7;
+; PTX-NEXT: mov.u64 %rd10, %rd6;
+; PTX-NEXT: cvta.param.u64 %rd11, %rd10;
+; PTX-NEXT: ld.global.u32 %r1, [%rd1];
+; PTX-NEXT: setp.lt.s32 %p1, %r1, 0;
+; PTX-NEXT: @%p1 bra $L__BB9_2;
+; PTX-NEXT: // %bb.1: // %second
+; PTX-NEXT: mov.b64 %rd8, grid_const_phi_ngc_param_1;
+; PTX-NEXT: mov.u64 %rd9, %rd8;
+; PTX-NEXT: cvta.param.u64 %rd2, %rd9;
+; PTX-NEXT: add.s64 %rd11, %rd2, 4;
+; PTX-NEXT: $L__BB9_2: // %merge
+; PTX-NEXT: ld.u32 %r2, [%rd11];
+; PTX-NEXT: st.global.u32 [%rd1], %r2;
+; PTX-NEXT: ret;
+; OPT-LABEL: define void @grid_const_phi_ngc(
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
+; OPT-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
+; OPT-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]])
+; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
+; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4
+; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
+; OPT-NEXT: br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; OPT: [[FIRST]]:
-; OPT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0
-; OPT: br label %[[MERGE:.*]]
+; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0
+; OPT-NEXT: br label %[[MERGE:.*]]
; OPT: [[SECOND]]:
-; OPT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT24]], i32 0, i32 1
-; OPT: br label %[[MERGE]]
+; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT2_PARAM_GEN]], i32 0, i32 1
+; OPT-NEXT: br label %[[MERGE]]
; OPT: [[MERGE]]:
-; OPT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
-;
+; OPT-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
+; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
+; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
+; OPT-NEXT: ret void
%val = load i32, ptr %inout
%less = icmp slt i32 %val, 0
br i1 %less, label %first, label %second
@@ -278,22 +439,42 @@ merge:
}
; NOTE: %input2 is *not* grid_constant
-define void @grid_const_select_escape(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %inout) {
-; PTX-LABEL: grid_const_select_escape(
-; PTX: cvta.param.{{.*}} [[RD2:%.*]], [[RD1:%.*]]
-; PTX: setp.lt.{{.*}} [[P1:%.*]], {{%.*}}, 0
-; PTX: add.{{.*}} [[RD3:%.*]], %SP, 0;
-; PTX: selp.{{.*}} [[RD4:%.*]], [[RD2]], [[RD3]], [[P1]];
-; PTX: ld.u32 {{%.*}}, [[[RD4]]];
-; OPT-LABEL: define void @grid_const_select_escape(
-; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) {
-; OPT: [[INPUT24:%.*]] = alloca i32, align 4
-; OPT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; OPT: [[INPUT11:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT12]])
-; OPT: load i32, ptr [[INOUT]]
-; OPT: [[PTRNEW:%.*]] = select i1 [[LESS:%.*]], ptr [[INPUT11]], ptr [[INPUT24]]
-; OPT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
-;
+define void @grid_const_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %inout) {
+; PTX-LABEL: grid_const_select(
+; PTX: {
+; PTX-NEXT: .reg .pred %p<2>;
+; PTX-NEXT: .reg .b32 %r<3>;
+; PTX-NEXT: .reg .b64 %rd<10>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd1, grid_const_select_param_0;
+; PTX-NEXT: ld.param.u64 %rd2, [grid_const_select_param_2];
+; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2;
+; PTX-NEXT: mov.b64 %rd4, grid_const_select_param_1;
+; PTX-NEXT: mov.u64 %rd5, %rd4;
+; PTX-NEXT: cvta.param.u64 %rd6, %rd5;
+; PTX-NEXT: mov.u64 %rd7, %rd1;
+; PTX-NEXT: cvta.param.u64 %rd8, %rd7;
+; PTX-NEXT: ld.global.u32 %r1, [%rd3];
+; PTX-NEXT: setp.lt.s32 %p1, %r1, 0;
+; PTX-NEXT: selp.b64 %rd9, %rd8, %rd6, %p1;
+; PTX-NEXT: ld.u32 %r2, [%rd9];
+; PTX-NEXT: st.global.u32 [%rd3], %r2;
+; PTX-NEXT: ret;
+; OPT-LABEL: define void @grid_const_select(
+; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
+; OPT-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
+; OPT-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]])
+; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
+; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4
+; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
+; OPT-NEXT: [[PTRNEW:%.*]] = select i1 [[LESS]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]]
+; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
+; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
+; OPT-NEXT: ret void
%val = load i32, ptr %inout
%less = icmp slt i32 %val, 0
%ptrnew = select i1 %less, ptr %input1, ptr %input2
@@ -304,16 +485,27 @@ define void @grid_const_select_escape(ptr byval(i32) align 4 %input1, ptr byval(
define i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
; PTX-LABEL: grid_const_ptrtoint(
-; PTX-NOT: .local
-; PTX: ld.param.{{.*}} {{%.*}}, [grid_const_ptrtoint_param_0];
-; PTX: cvta.param.u64 [[RD1:%.*]], {{%.*}}
-; PTX: cvt.u32.u64 {{%.*}}, [[RD1]]
+; PTX: {
+; PTX-NEXT: .reg .b32 %r<4>;
+; PTX-NEXT: .reg .b64 %rd<4>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd1, grid_const_ptrtoint_param_0;
+; PTX-NEXT: mov.u64 %rd2, %rd1;
+; PTX-NEXT: ld.param.u32 %r1, [grid_const_ptrtoint_param_0];
+; PTX-NEXT: cvta.param.u64 %rd3, %rd2;
+; PTX-NEXT: cvt.u32.u64 %r2, %rd3;
+; PTX-NEXT: add.s32 %r3, %r1, %r2;
+; PTX-NEXT: st.param.b32 [func_retval0+0], %r3;
+; PTX-NEXT: ret;
; OPT-LABEL: define i32 @grid_const_ptrtoint(
-; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) {
-; OPT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT: [[VAL:%.*]] = load i32, ptr addrspace(101) [[INPUT2]]
-; OPT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]])
-; OPT: [[PTRVAL:%.*]] = ptrtoint ptr [[INPUT1]] to i32
+; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT3:%.*]] = load i32, ptr addrspace(101) [[INPUT2]], align 4
+; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]])
+; OPT-NEXT: [[PTRVAL:%.*]] = ptrtoint ptr [[INPUT1]] to i32
+; OPT-NEXT: [[KEEPALIVE:%.*]] = add i32 [[INPUT3]], [[PTRVAL]]
+; OPT-NEXT: ret i32 [[KEEPALIVE]]
%val = load i32, ptr %input
%ptrval = ptrtoint ptr %input to i32
%keepalive = add i32 %val, %ptrval
@@ -352,13 +544,13 @@ declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr
!14 = !{ptr @grid_const_partial_escapemem, !"kernel", i32 1, !"grid_constant", !15}
!15 = !{i32 1}
-!16 = !{ptr @grid_const_phi_escape, !"kernel", i32 1, !"grid_constant", !17}
+!16 = !{ptr @grid_const_phi, !"kernel", i32 1, !"grid_constant", !17}
!17 = !{i32 1}
-!18 = !{ptr @grid_const_phi_escape2, !"kernel", i32 1, !"grid_constant", !19}
+!18 = !{ptr @grid_const_phi_ngc, !"kernel", i32 1, !"grid_constant", !19}
!19 = !{i32 1}
-!20 = !{ptr @grid_const_select_escape, !"kernel", i32 1, !"grid_constant", !21}
+!20 = !{ptr @grid_const_select, !"kernel", i32 1, !"grid_constant", !21}
!21 = !{i32 1}
!22 = !{ptr @grid_const_ptrtoint, !"kernel", i32 1, !"grid_constant", !23}
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index f041f20..a414a6c 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -1,166 +1,469 @@
-; RUN: llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK32
-; RUN: llc < %s -mtriple nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK64
-; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
-; RUN: %if ptxas %{ llc < %s -mtriple nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
-
-%struct.ham = type { [4 x i32] }
-
-; // Verify that load with static offset into parameter is done directly.
-; CHECK-LABEL: .visible .entry static_offset
-; CHECK-NOT: .local
-; CHECK64: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
-; CHECK64: mov.b64 %[[param_addr:rd[0-9]+]], {{.*}}_param_1
-; CHECK64: mov.u64 %[[param_addr1:rd[0-9]+]], %[[param_addr]]
-; CHECK64: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
-;
-; CHECK32: ld.param.u32 [[result_addr:%r[0-9]+]], [{{.*}}_param_0]
-; CHECK32: mov.b32 %[[param_addr:r[0-9]+]], {{.*}}_param_1
-; CHECK32: mov.u32 %[[param_addr1:r[0-9]+]], %[[param_addr]]
-; CHECK32: cvta.to.global.u32 [[result_addr_g:%r[0-9]+]], [[result_addr]]
-;
-; CHECK: ld.param.u32 [[value:%r[0-9]+]], [%[[param_addr1]]+12];
-; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
-; Function Attrs: nofree norecurse nounwind willreturn mustprogress
-define dso_local void @static_offset(ptr nocapture %arg, ptr nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
-bb:
- %tmp = icmp eq i32 %arg2, 3
- br i1 %tmp, label %bb3, label %bb6
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --scrub-attributes --version 5
+; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_60 -mattr=ptx77 -nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,SM_60
+; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_70 -mattr=ptx77 -nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,SM_70
+source_filename = "<stdin>"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.S = type { i32, i32 }
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+declare dso_local void @_Z6escapePv(ptr noundef) local_unnamed_addr #0
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+declare dso_local void @_Z6escapei(i32 noundef) local_unnamed_addr #0
-bb3: ; preds = %bb
- %tmp4 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 3
- %tmp5 = load i32, ptr %tmp4, align 4
- store i32 %tmp5, ptr %arg, align 4
- br label %bb6
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1
-bb6: ; preds = %bb3, %bb
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) #1
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write)
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @read_only(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; COMMON-LABEL: define dso_local void @read_only(
+; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; COMMON-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4
+; COMMON-NEXT: store i32 [[I]], ptr [[OUT2]], align 4
+; COMMON-NEXT: ret void
+;
+entry:
+ %i = load i32, ptr %s, align 4
+ store i32 %i, ptr %out, align 4
ret void
}
-; // Verify that load with dynamic offset into parameter is also done directly.
-; CHECK-LABEL: .visible .entry dynamic_offset
-; CHECK-NOT: .local
-; CHECK64: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
-; CHECK64: mov.b64 %[[param_addr:rd[0-9]+]], {{.*}}_param_1
-; CHECK64: mov.u64 %[[param_addr1:rd[0-9]+]], %[[param_addr]]
-; CHECK64: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
-; CHECK64: add.s64 %[[param_w_offset:rd[0-9]+]], %[[param_addr1]],
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @read_only_gep(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; COMMON-LABEL: define dso_local void @read_only_gep(
+; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; COMMON-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4
+; COMMON-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4
+; COMMON-NEXT: store i32 [[I]], ptr [[OUT2]], align 4
+; COMMON-NEXT: ret void
;
-; CHECK32: ld.param.u32 [[result_addr:%r[0-9]+]], [{{.*}}_param_0]
-; CHECK32: mov.b32 %[[param_addr:r[0-9]+]], {{.*}}_param_1
-; CHECK32: mov.u32 %[[param_addr1:r[0-9]+]], %[[param_addr]]
-; CHECK32: cvta.to.global.u32 [[result_addr_g:%r[0-9]+]], [[result_addr]]
-; CHECK32: add.s32 %[[param_w_offset:r[0-9]+]], %[[param_addr1]],
+entry:
+ %b = getelementptr inbounds nuw i8, ptr %s, i64 4
+ %i = load i32, ptr %b, align 4
+ store i32 %i, ptr %out, align 4
+ ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @read_only_gep_asc(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; COMMON-LABEL: define dso_local void @read_only_gep_asc(
+; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; COMMON-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4
+; COMMON-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4
+; COMMON-NEXT: store i32 [[I]], ptr [[OUT2]], align 4
+; COMMON-NEXT: ret void
;
-; CHECK: ld.param.u32 [[value:%r[0-9]+]], [%[[param_w_offset]]];
-; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
+entry:
+ %b = getelementptr inbounds nuw i8, ptr %s, i64 4
+ %asc = addrspacecast ptr %b to ptr addrspace(101)
+ %i = load i32, ptr addrspace(101) %asc, align 4
+ store i32 %i, ptr %out, align 4
+ ret void
+}
-; Function Attrs: nofree norecurse nounwind willreturn mustprogress
-define dso_local void @dynamic_offset(ptr nocapture %arg, ptr nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
-bb:
- %tmp = sext i32 %arg2 to i64
- %tmp3 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 %tmp
- %tmp4 = load i32, ptr %tmp3, align 4
- store i32 %tmp4, ptr %arg, align 4
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; COMMON-LABEL: define dso_local void @read_only_gep_asc0(
+; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
+; COMMON-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
+; COMMON-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr
+; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4
+; COMMON-NEXT: store i32 [[I]], ptr [[OUT2]], align 4
+; COMMON-NEXT: ret void
+;
+entry:
+ %b = getelementptr inbounds nuw i8, ptr %s, i64 4
+ %asc = addrspacecast ptr %b to ptr addrspace(101)
+ %asc0 = addrspacecast ptr addrspace(101) %asc to ptr
+ %i = load i32, ptr %asc0, align 4
+ store i32 %i, ptr %out, align 4
+ ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @escape_ptr(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; COMMON-LABEL: define dso_local void @escape_ptr(
+; COMMON-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S3]])
+; COMMON-NEXT: ret void
+;
+entry:
+ call void @_Z6escapePv(ptr noundef nonnull %s) #0
ret void
}
-; Same as above, but with a bitcast present in the chain
-; CHECK-LABEL:.visible .entry gep_bitcast
-; CHECK-NOT: .local
-; CHECK64-DAG: ld.param.u64 [[out:%rd[0-9]+]], [gep_bitcast_param_0]
-; CHECK64-DAG: mov.b64 {{%rd[0-9]+}}, gep_bitcast_param_1
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; COMMON-LABEL: define dso_local void @escape_ptr_gep(
+; COMMON-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
+; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]])
+; COMMON-NEXT: ret void
;
-; CHECK32-DAG: ld.param.u32 [[out:%r[0-9]+]], [gep_bitcast_param_0]
-; CHECK32-DAG: mov.b32 {{%r[0-9]+}}, gep_bitcast_param_1
+entry:
+ %b = getelementptr inbounds nuw i8, ptr %s, i64 4
+ call void @_Z6escapePv(ptr noundef nonnull %b) #0
+ ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @escape_ptr_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; COMMON-LABEL: define dso_local void @escape_ptr_store(
+; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; COMMON-NEXT: store ptr [[S3]], ptr [[OUT2]], align 8
+; COMMON-NEXT: ret void
;
-; CHECK-DAG: ld.param.u32 {{%r[0-9]+}}, [gep_bitcast_param_2]
-; CHECK64: ld.param.u8 [[value:%rs[0-9]+]], [{{%rd[0-9]+}}]
-; CHECK64: st.global.u8 [{{%rd[0-9]+}}], [[value]];
-; CHECK32: ld.param.u8 [[value:%rs[0-9]+]], [{{%r[0-9]+}}]
-; CHECK32: st.global.u8 [{{%r[0-9]+}}], [[value]];
+entry:
+ store ptr %s, ptr %out, align 8
+ ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; COMMON-LABEL: define dso_local void @escape_ptr_gep_store(
+; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
+; COMMON-NEXT: store ptr [[B]], ptr [[OUT2]], align 8
+; COMMON-NEXT: ret void
+;
+entry:
+ %b = getelementptr inbounds nuw i8, ptr %s, i64 4
+ store ptr %b, ptr %out, align 8
+ ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; COMMON-LABEL: define dso_local void @escape_ptrtoint(
+; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; COMMON-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64
+; COMMON-NEXT: store i64 [[I]], ptr [[OUT2]], align 8
+; COMMON-NEXT: ret void
+;
+entry:
+ %i = ptrtoint ptr %s to i64
+ store i64 %i, ptr %out, align 8
+ ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @memcpy_from_param(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; COMMON-LABEL: define dso_local void @memcpy_from_param(
+; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true)
+; COMMON-NEXT: ret void
+;
+entry:
+ tail call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true)
+ ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef readnone byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; COMMON-LABEL: define dso_local void @memcpy_to_param(
+; COMMON-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef readnone byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[ENTRY:.*:]]
+; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1)
+; COMMON-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr
+; COMMON-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true)
+; COMMON-NEXT: ret void
+;
+entry:
+ tail call void @llvm.memcpy.p0.p0.i64(ptr %s, ptr %in, i64 16, i1 true)
+ ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @copy_on_store(ptr nocapture noundef readonly %in, ptr nocapture noundef byval(%struct.S) align 4 %s, i1 noundef zeroext %b) local_unnamed_addr #0 {
+; COMMON-LABEL: define dso_local void @copy_on_store(
+; COMMON-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; COMMON-NEXT: [[BB:.*:]]
+; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; COMMON-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1)
+; COMMON-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr
+; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[IN2]], align 4
+; COMMON-NEXT: store i32 [[I]], ptr [[S3]], align 4
+; COMMON-NEXT: ret void
;
-; Function Attrs: nofree norecurse nounwind willreturn mustprogress
-define dso_local void @gep_bitcast(ptr nocapture %out, ptr nocapture readonly byval(%struct.ham) align 4 %in, i32 %n) local_unnamed_addr #0 {
bb:
- %n64 = sext i32 %n to i64
- %gep = getelementptr inbounds %struct.ham, ptr %in, i64 0, i32 0, i64 %n64
- %load = load i8, ptr %gep, align 4
- store i8 %load, ptr %out, align 4
+ %i = load i32, ptr %in, align 4
+ store i32 %i, ptr %s, align 4
ret void
}
-; Same as above, but with an ASC(101) present in the chain
-; CHECK-LABEL:.visible .entry gep_bitcast_asc
-; CHECK-NOT: .local
-; CHECK64-DAG: ld.param.u64 [[out:%rd[0-9]+]], [gep_bitcast_asc_param_0]
-; CHECK64-DAG: mov.b64 {{%rd[0-9]+}}, gep_bitcast_asc_param_1
+define void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) {
+; SM_60-LABEL: define void @test_select(
+; SM_60-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
+; SM_60-NEXT: [[BB:.*:]]
+; SM_60-NEXT: [[OUT7:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; SM_60-NEXT: [[OUT8:%.*]] = addrspacecast ptr addrspace(1) [[OUT7]] to ptr
+; SM_60-NEXT: [[INPUT24:%.*]] = alloca i32, align 4
+; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; SM_60-NEXT: [[INPUT26:%.*]] = load i32, ptr addrspace(101) [[INPUT25]], align 4
+; SM_60-NEXT: store i32 [[INPUT26]], ptr [[INPUT24]], align 4
+; SM_60-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
+; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; SM_60-NEXT: [[INPUT13:%.*]] = load i32, ptr addrspace(101) [[INPUT12]], align 4
+; SM_60-NEXT: store i32 [[INPUT13]], ptr [[INPUT11]], align 4
+; SM_60-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]]
+; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
+; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[OUT8]], align 4
+; SM_60-NEXT: ret void
;
-; CHECK32-DAG: ld.param.u32 [[out:%r[0-9]+]], [gep_bitcast_asc_param_0]
-; CHECK32-DAG: mov.b32 {{%r[0-9]+}}, gep_bitcast_asc_param_1
+; SM_70-LABEL: define void @test_select(
+; SM_70-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
+; SM_70-NEXT: [[BB:.*:]]
+; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; SM_70-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]])
+; SM_70-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
+; SM_70-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]]
+; SM_70-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
+; SM_70-NEXT: store i32 [[VALLOADED]], ptr [[OUT2]], align 4
+; SM_70-NEXT: ret void
;
-; CHECK-DAG: ld.param.u32 {{%r[0-9]+}}, [gep_bitcast_asc_param_2]
-; CHECK64: ld.param.u8 [[value:%rs[0-9]+]], [{{%rd[0-9]+}}]
-; CHECK64: st.global.u8 [{{%rd[0-9]+}}], [[value]];
-; CHECK32: ld.param.u8 [[value:%rs[0-9]+]], [{{%r[0-9]+}}]
-; CHECK32: st.global.u8 [{{%r[0-9]+}}], [[value]];
+bb:
+ %ptrnew = select i1 %cond, ptr %input1, ptr %input2
+ %valloaded = load i32, ptr %ptrnew, align 4
+ store i32 %valloaded, ptr %out, align 4
+ ret void
+}
+
+define void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) {
+; COMMON-LABEL: define void @test_select_write(
+; COMMON-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] {
+; COMMON-NEXT: [[BB:.*:]]
+; COMMON-NEXT: [[OUT7:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; COMMON-NEXT: [[OUT8:%.*]] = addrspacecast ptr addrspace(1) [[OUT7]] to ptr
+; COMMON-NEXT: [[INPUT24:%.*]] = alloca i32, align 4
+; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; COMMON-NEXT: [[INPUT26:%.*]] = load i32, ptr addrspace(101) [[INPUT25]], align 4
+; COMMON-NEXT: store i32 [[INPUT26]], ptr [[INPUT24]], align 4
+; COMMON-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
+; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; COMMON-NEXT: [[INPUT13:%.*]] = load i32, ptr addrspace(101) [[INPUT12]], align 4
+; COMMON-NEXT: store i32 [[INPUT13]], ptr [[INPUT11]], align 4
+; COMMON-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]]
+; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4
+; COMMON-NEXT: ret void
;
-; Function Attrs: nofree norecurse nounwind willreturn mustprogress
-define dso_local void @gep_bitcast_asc(ptr nocapture %out, ptr nocapture readonly byval(%struct.ham) align 4 %in, i32 %n) local_unnamed_addr #0 {
bb:
- %n64 = sext i32 %n to i64
- %gep = getelementptr inbounds %struct.ham, ptr %in, i64 0, i32 0, i64 %n64
- %asc = addrspacecast ptr %gep to ptr addrspace(101)
- %load = load i8, ptr addrspace(101) %asc, align 4
- store i8 %load, ptr %out, align 4
- ret void
-}
-
-
-; Verify that if the pointer escapes, then we do fall back onto using a temp copy.
-; CHECK-LABEL: .visible .entry pointer_escapes
-; CHECK: .local .align 4 .b8 __local_depot{{.*}}
-; CHECK64: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
-; CHECK64: add.u64 %[[copy_addr:rd[0-9]+]], %SPL, 0;
-; CHECK32: ld.param.u32 [[result_addr:%r[0-9]+]], [{{.*}}_param_0]
-; CHECK32: add.u32 %[[copy_addr:r[0-9]+]], %SPL, 0;
-; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+12];
-; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+8];
-; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+4];
-; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1];
-; CHECK-DAG: st.local.u32 [%[[copy_addr]]+12],
-; CHECK-DAG: st.local.u32 [%[[copy_addr]]+8],
-; CHECK-DAG: st.local.u32 [%[[copy_addr]]+4],
-; CHECK-DAG: st.local.u32 [%[[copy_addr]]],
-; CHECK64: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
-; CHECK64: add.s64 %[[copy_w_offset:rd[0-9]+]], %[[copy_addr]],
-; CHECK32: cvta.to.global.u32 [[result_addr_g:%r[0-9]+]], [[result_addr]]
-; CHECK32: add.s32 %[[copy_w_offset:r[0-9]+]], %[[copy_addr]],
-; CHECK: ld.local.u32 [[value:%r[0-9]+]], [%[[copy_w_offset]]];
-; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
-
-; Function Attrs: convergent norecurse nounwind mustprogress
-define dso_local void @pointer_escapes(ptr nocapture %arg, ptr byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #1 {
+ %ptrnew = select i1 %cond, ptr %input1, ptr %input2
+ store i32 1, ptr %ptrnew, align 4
+ ret void
+}
+
+define void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, ptr %inout, i1 %cond) {
+; SM_60-LABEL: define void @test_phi(
+; SM_60-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
+; SM_60-NEXT: [[BB:.*:]]
+; SM_60-NEXT: [[INOUT7:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
+; SM_60-NEXT: [[INOUT8:%.*]] = addrspacecast ptr addrspace(1) [[INOUT7]] to ptr
+; SM_60-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8
+; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; SM_60-NEXT: [[INPUT26:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT25]], align 8
+; SM_60-NEXT: store [[STRUCT_S]] [[INPUT26]], ptr [[INPUT24]], align 4
+; SM_60-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
+; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; SM_60-NEXT: [[INPUT13:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT12]], align 4
+; SM_60-NEXT: store [[STRUCT_S]] [[INPUT13]], ptr [[INPUT11]], align 4
+; SM_60-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
+; SM_60: [[FIRST]]:
+; SM_60-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0
+; SM_60-NEXT: br label %[[MERGE:.*]]
+; SM_60: [[SECOND]]:
+; SM_60-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT24]], i32 0, i32 1
+; SM_60-NEXT: br label %[[MERGE]]
+; SM_60: [[MERGE]]:
+; SM_60-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
+; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
+; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[INOUT8]], align 4
+; SM_60-NEXT: ret void
+;
+; SM_70-LABEL: define void @test_phi(
+; SM_70-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
+; SM_70-NEXT: [[BB:.*:]]
+; SM_70-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
+; SM_70-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
+; SM_70-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]])
+; SM_70-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
+; SM_70-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
+; SM_70: [[FIRST]]:
+; SM_70-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0
+; SM_70-NEXT: br label %[[MERGE:.*]]
+; SM_70: [[SECOND]]:
+; SM_70-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT2_PARAM_GEN]], i32 0, i32 1
+; SM_70-NEXT: br label %[[MERGE]]
+; SM_70: [[MERGE]]:
+; SM_70-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
+; SM_70-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
+; SM_70-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
+; SM_70-NEXT: ret void
+;
bb:
- %tmp = sext i32 %arg2 to i64
- %tmp3 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 %tmp
- %tmp4 = load i32, ptr %tmp3, align 4
- store i32 %tmp4, ptr %arg, align 4
- %tmp5 = call ptr @escape(ptr nonnull %tmp3) #3
+ br i1 %cond, label %first, label %second
+
+first: ; preds = %bb
+ %ptr1 = getelementptr inbounds %struct.S, ptr %input1, i32 0, i32 0
+ br label %merge
+
+second: ; preds = %bb
+ %ptr2 = getelementptr inbounds %struct.S, ptr %input2, i32 0, i32 1
+ br label %merge
+
+merge: ; preds = %second, %first
+ %ptrnew = phi ptr [ %ptr1, %first ], [ %ptr2, %second ]
+ %valloaded = load i32, ptr %ptrnew, align 4
+ store i32 %valloaded, ptr %inout, align 4
ret void
}
-; Function Attrs: convergent nounwind
-declare dso_local ptr @escape(ptr) local_unnamed_addr
+define void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, i1 %cond) {
+; COMMON-LABEL: define void @test_phi_write(
+; COMMON-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] {
+; COMMON-NEXT: [[BB:.*:]]
+; COMMON-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8
+; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; COMMON-NEXT: [[INPUT26:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT25]], align 8
+; COMMON-NEXT: store [[STRUCT_S]] [[INPUT26]], ptr [[INPUT24]], align 4
+; COMMON-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
+; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; COMMON-NEXT: [[INPUT13:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT12]], align 4
+; COMMON-NEXT: store [[STRUCT_S]] [[INPUT13]], ptr [[INPUT11]], align 4
+; COMMON-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
+; COMMON: [[FIRST]]:
+; COMMON-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0
+; COMMON-NEXT: br label %[[MERGE:.*]]
+; COMMON: [[SECOND]]:
+; COMMON-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT24]], i32 0, i32 1
+; COMMON-NEXT: br label %[[MERGE]]
+; COMMON: [[MERGE]]:
+; COMMON-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
+; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4
+; COMMON-NEXT: ret void
+;
+bb:
+ br i1 %cond, label %first, label %second
+
+first: ; preds = %bb
+ %ptr1 = getelementptr inbounds %struct.S, ptr %input1, i32 0, i32 0
+ br label %merge
+
+second: ; preds = %bb
+ %ptr2 = getelementptr inbounds %struct.S, ptr %input2, i32 0, i32 1
+ br label %merge
+
+merge: ; preds = %second, %first
+ %ptrnew = phi ptr [ %ptr1, %first ], [ %ptr2, %second ]
+ store i32 1, ptr %ptrnew, align 4
+ ret void
+}
+attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "no-trapping-math"="true" "target-cpu"="sm_60" "target-features"="+ptx78,+sm_60" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }
-!llvm.module.flags = !{!0, !1, !2}
-!nvvm.annotations = !{!3, !4, !5, !6, !7}
+!llvm.module.flags = !{!0, !1, !2, !3}
+!nvvm.annotations = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19}
+!llvm.ident = !{!20, !21}
-!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 9, i32 1]}
+!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 8]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
-!3 = !{ptr @static_offset, !"kernel", i32 1}
-!4 = !{ptr @dynamic_offset, !"kernel", i32 1}
-!5 = !{ptr @pointer_escapes, !"kernel", i32 1}
-!6 = !{ptr @gep_bitcast, !"kernel", i32 1}
-!7 = !{ptr @gep_bitcast_asc, !"kernel", i32 1}
+!3 = !{i32 7, !"frame-pointer", i32 2}
+!4 = !{ptr @read_only, !"kernel", i32 1}
+!5 = !{ptr @escape_ptr, !"kernel", i32 1}
+!6 = !{ptr @escape_ptr_gep, !"kernel", i32 1}
+!7 = !{ptr @escape_ptr_store, !"kernel", i32 1}
+!8 = !{ptr @escape_ptr_gep_store, !"kernel", i32 1}
+!9 = !{ptr @escape_ptrtoint, !"kernel", i32 1}
+!10 = !{ptr @memcpy_from_param, !"kernel", i32 1}
+!11 = !{ptr @memcpy_to_param, !"kernel", i32 1}
+!12 = !{ptr @copy_on_store, !"kernel", i32 1}
+!13 = !{ptr @read_only_gep, !"kernel", i32 1}
+!14 = !{ptr @read_only_gep_asc, !"kernel", i32 1}
+!15 = !{ptr @read_only_gep_asc0, !"kernel", i32 1}
+!16 = !{ptr @test_select, !"kernel", i32 1}
+!17 = !{ptr @test_phi, !"kernel", i32 1}
+!18 = !{ptr @test_phi_write, !"kernel", i32 1}
+!19 = !{ptr @test_select_write, !"kernel", i32 1}
+!20 = !{!"clang version 20.0.0git"}
+!21 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
diff --git a/llvm/test/CodeGen/RISCV/machineoutliner-pcrel-lo.mir b/llvm/test/CodeGen/RISCV/machineoutliner-pcrel-lo.mir
index 8a83543..fd3630b 100644
--- a/llvm/test/CodeGen/RISCV/machineoutliner-pcrel-lo.mir
+++ b/llvm/test/CodeGen/RISCV/machineoutliner-pcrel-lo.mir
@@ -18,6 +18,9 @@
define i32 @foo2(i32 %a, i32 %b) comdat { ret i32 0 }
define i32 @foo3(i32 %a, i32 %b) section ".abc" { ret i32 0 }
+
+ define i32 @foo4(i32 %a, i32 %b) !section_prefix !0 { ret i32 0 }
+ !0 = !{!"function_section_prefix", !"myprefix"}
...
---
name: foo
@@ -27,23 +30,24 @@ body: |
; CHECK: bb.0:
; CHECK-NEXT: liveins: $x10, $x11, $x13
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11, implicit $x13
+ ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_1, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11, implicit $x13
; CHECK-NEXT: PseudoBR %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: liveins: $x10, $x11, $x13
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11, implicit $x13
+ ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_1, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11, implicit $x13
; CHECK-NEXT: PseudoBR %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: liveins: $x10, $x11, $x13
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11, implicit $x13
+ ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_1, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11, implicit $x13
; CHECK-NEXT: PseudoBR %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: PseudoRET
+ ;
; CHECK-FS-LABEL: name: foo
; CHECK-FS: bb.0:
; CHECK-FS-NEXT: liveins: $x10, $x11, $x13
@@ -109,26 +113,27 @@ body: |
; CHECK: bb.0:
; CHECK-NEXT: liveins: $x10, $x11, $x13
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_1, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+ ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
; CHECK-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar)
; CHECK-NEXT: PseudoBR %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: liveins: $x10, $x11, $x13
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_1, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+ ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
; CHECK-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar)
; CHECK-NEXT: PseudoBR %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: liveins: $x10, $x11, $x13
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_1, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+ ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
; CHECK-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar)
; CHECK-NEXT: PseudoBR %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: PseudoRET
+ ;
; CHECK-FS-LABEL: name: foo2
; CHECK-FS: bb.0:
; CHECK-FS-NEXT: liveins: $x10, $x11, $x13
@@ -223,6 +228,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: PseudoRET
+ ;
; CHECK-FS-LABEL: name: foo3
; CHECK-FS: bb.0:
; CHECK-FS-NEXT: liveins: $x10, $x11, $x13
@@ -289,3 +295,89 @@ body: |
bb.3:
PseudoRET
...
+---
+name: foo4
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: foo4
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: liveins: $x10, $x11, $x13
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+ ; CHECK-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar)
+ ; CHECK-NEXT: PseudoBR %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: liveins: $x10, $x11, $x13
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+ ; CHECK-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar)
+ ; CHECK-NEXT: PseudoBR %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: liveins: $x10, $x11, $x13
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+ ; CHECK-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar)
+ ; CHECK-NEXT: PseudoBR %bb.3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3:
+ ; CHECK-NEXT: PseudoRET
+ ;
+ ; CHECK-FS-LABEL: name: foo4
+ ; CHECK-FS: bb.0:
+ ; CHECK-FS-NEXT: liveins: $x10, $x11, $x13
+ ; CHECK-FS-NEXT: {{ $}}
+ ; CHECK-FS-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+ ; CHECK-FS-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar)
+ ; CHECK-FS-NEXT: PseudoBR %bb.3
+ ; CHECK-FS-NEXT: {{ $}}
+ ; CHECK-FS-NEXT: bb.1:
+ ; CHECK-FS-NEXT: liveins: $x10, $x11, $x13
+ ; CHECK-FS-NEXT: {{ $}}
+ ; CHECK-FS-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+ ; CHECK-FS-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar)
+ ; CHECK-FS-NEXT: PseudoBR %bb.3
+ ; CHECK-FS-NEXT: {{ $}}
+ ; CHECK-FS-NEXT: bb.2:
+ ; CHECK-FS-NEXT: liveins: $x10, $x11, $x13
+ ; CHECK-FS-NEXT: {{ $}}
+ ; CHECK-FS-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+ ; CHECK-FS-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar)
+ ; CHECK-FS-NEXT: PseudoBR %bb.3
+ ; CHECK-FS-NEXT: {{ $}}
+ ; CHECK-FS-NEXT: bb.3:
+ ; CHECK-FS-NEXT: PseudoRET
+ bb.0:
+ liveins: $x10, $x11, $x13
+
+ $x11 = ORI $x11, 1023
+ $x12 = ADDI $x10, 17
+ $x11 = AND $x12, $x11
+ $x10 = SUB $x10, $x11
+ $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar)
+ PseudoBR %bb.3
+
+ bb.1:
+ liveins: $x10, $x11, $x13
+
+ $x11 = ORI $x11, 1023
+ $x12 = ADDI $x10, 17
+ $x11 = AND $x12, $x11
+ $x10 = SUB $x10, $x11
+ $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar)
+ PseudoBR %bb.3
+
+ bb.2:
+ liveins: $x10, $x11, $x13
+
+ $x11 = ORI $x11, 1023
+ $x12 = ADDI $x10, 17
+ $x11 = AND $x12, $x11
+ $x10 = SUB $x10, $x11
+ $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) <mcsymbol .Lpcrel_hi1> :: (dereferenceable load (s32) from @bar)
+ PseudoBR %bb.3
+
+ bb.3:
+ PseudoRET
+...
diff --git a/llvm/test/CodeGen/WebAssembly/half-precision.ll b/llvm/test/CodeGen/WebAssembly/half-precision.ll
index c0b14d2..5f0ba4a 100644
--- a/llvm/test/CodeGen/WebAssembly/half-precision.ll
+++ b/llvm/test/CodeGen/WebAssembly/half-precision.ll
@@ -1,5 +1,4 @@
; RUN: llc < %s --mtriple=wasm32-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128 | FileCheck %s
-; RUN: llc < %s --mtriple=wasm64-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128 | FileCheck %s
declare float @llvm.wasm.loadf32.f16(ptr)
declare void @llvm.wasm.storef16.f32(float, ptr)
@@ -28,6 +27,13 @@ define <8 x half> @splat_v8f16(float %x) {
ret <8 x half> %v
}
+; CHECK-LABEL: const_splat_v8f16:
+; CHECK: v128.const $push0=, 20800, 0, 0, 0, 0, 0, 0, 20800
+; CHECK-NEXT: return $pop0
+define <8 x half> @const_splat_v8f16() {
+ ret <8 x half> <half 42., half 0., half 0., half 0., half 0., half 0., half 0., half 42.>
+}
+
; CHECK-LABEL: extract_lane_v8f16:
; CHECK: f16x8.extract_lane $push0=, $0, 1
; CHECK-NEXT: return $pop0
@@ -308,3 +314,24 @@ define <8 x i16> @trunc_sat_u_v8i16_sat(<8 x half> %x) {
%a = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> %x)
ret <8 x i16> %a
}
+
+; ==============================================================================
+; Load and Store
+; ==============================================================================
+define <8 x half> @load_v8f16(ptr %p) {
+; CHECK-LABEL: load_v8f16:
+; CHECK: .functype load_v8f16 (i32) -> (v128)
+; CHECK-NEXT: v128.load $push0=, 0($0)
+; CHECK-NEXT: return $pop0
+ %v = load <8 x half>, ptr %p
+ ret <8 x half> %v
+}
+
+define void @store_v8f16(<8 x half> %v, ptr %p) {
+; CHECK-LABEL: store_v8f16:
+; CHECK: .functype store_v8f16 (v128, i32) -> ()
+; CHECK-NEXT: v128.store 0($1), $0
+; CHECK-NEXT: return
+ store <8 x half> %v , ptr %p
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
index 430b0db..d1d1b0a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
@@ -7,8 +7,8 @@ target triple = "aarch64--linux-gnu"
%pair = type { i8, i8 }
; CHECK-LABEL: test
-; CHECK: Found an estimated cost of 16 for VF 2 For instruction: {{.*}} load i8
-; CHECK: Found an estimated cost of 0 for VF 2 For instruction: {{.*}} load i8
+; CHECK: Found an estimated cost of 8 for VF 2 For instruction: {{.*}} load i8
+; CHECK: Found an estimated cost of 8 for VF 2 For instruction: {{.*}} load i8
; CHECK-LABEL: entry:
; CHECK-LABEL: vector.body:
; CHECK: [[LOAD1:%.*]] = load i8
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
index 21af9ae..dec124b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
@@ -168,8 +168,8 @@ entry:
; gaps.
;
; VF_2-LABEL: Checking a loop in 'i64_factor_8'
-; VF_2: Found an estimated cost of 16 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
+; VF_2: Found an estimated cost of 8 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 8 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8
; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8
for.body:
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
index c7a04e3..976c6a9 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
@@ -15,10 +15,10 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'i8_factor_2'
-; VF_2: Found an estimated cost of 24 for VF 2 For instruction: %tmp2 = load i8, ptr %tmp0, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i8, ptr %tmp1, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp2, ptr %tmp0, align 1
-; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store i8 %tmp3, ptr %tmp1, align 1
+; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load i8, ptr %tmp0, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i8, ptr %tmp1, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp2, ptr %tmp0, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp3, ptr %tmp1, align 1
; VF_4-LABEL: Checking a loop in 'i8_factor_2'
; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i8, ptr %tmp0, align 1
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i8, ptr %tmp1, align 1
@@ -56,10 +56,10 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'i16_factor_2'
-; VF_2: Found an estimated cost of 24 for VF 2 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i16, ptr %tmp1, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp2, ptr %tmp0, align 2
-; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store i16 %tmp3, ptr %tmp1, align 2
+; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i16, ptr %tmp1, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp2, ptr %tmp0, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp3, ptr %tmp1, align 2
; VF_4-LABEL: Checking a loop in 'i16_factor_2'
; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i16, ptr %tmp0, align 2
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp1, align 2
@@ -97,10 +97,10 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'i32_factor_2'
-; VF_2: Found an estimated cost of 24 for VF 2 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
-; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4
+; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4
; VF_4-LABEL: Checking a loop in 'i32_factor_2'
; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i32, ptr %tmp0, align 4
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp1, align 4
@@ -138,25 +138,25 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'i64_factor_2'
-; VF_2: Found an estimated cost of 44 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 44 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8
+; VF_2: Found an estimated cost of 22 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8
; VF_4-LABEL: Checking a loop in 'i64_factor_2'
-; VF_4: Found an estimated cost of 88 for VF 4 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp2, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 88 for VF 4 For instruction: store i64 %tmp3, ptr %tmp1, align 8
+; VF_4: Found an estimated cost of 44 for VF 4 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp3, ptr %tmp1, align 8
; VF_8-LABEL: Checking a loop in 'i64_factor_2'
-; VF_8: Found an estimated cost of 176 for VF 8 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp2, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 176 for VF 8 For instruction: store i64 %tmp3, ptr %tmp1, align 8
+; VF_8: Found an estimated cost of 88 for VF 8 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp3, ptr %tmp1, align 8
; VF_16-LABEL: Checking a loop in 'i64_factor_2'
-; VF_16: Found an estimated cost of 352 for VF 16 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp2, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 352 for VF 16 For instruction: store i64 %tmp3, ptr %tmp1, align 8
+; VF_16: Found an estimated cost of 176 for VF 16 For instruction: %tmp2 = load i64, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp2, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp3, ptr %tmp1, align 8
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i64.2, ptr %data, i64 %i, i32 0
@@ -179,10 +179,10 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'f16_factor_2'
-; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load half, ptr %tmp0, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load half, ptr %tmp1, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store half %tmp2, ptr %tmp0, align 2
-; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store half %tmp3, ptr %tmp1, align 2
+; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load half, ptr %tmp0, align 2
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load half, ptr %tmp1, align 2
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp2, ptr %tmp0, align 2
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp3, ptr %tmp1, align 2
; VF_4-LABEL: Checking a loop in 'f16_factor_2'
; VF_4: Found an estimated cost of 18 for VF 4 For instruction: %tmp2 = load half, ptr %tmp0, align 2
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, ptr %tmp1, align 2
@@ -261,25 +261,25 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'f64_factor_2'
-; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load double, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load double, ptr %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double %tmp2, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store double %tmp3, ptr %tmp1, align 8
+; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load double, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load double, ptr %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp2, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp3, ptr %tmp1, align 8
; VF_4-LABEL: Checking a loop in 'f64_factor_2'
-; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp2 = load double, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load double, ptr %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double %tmp2, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store double %tmp3, ptr %tmp1, align 8
+; VF_4: Found an estimated cost of 12 for VF 4 For instruction: %tmp2 = load double, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp3 = load double, ptr %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp2, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp3, ptr %tmp1, align 8
; VF_8-LABEL: Checking a loop in 'f64_factor_2'
-; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp2 = load double, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load double, ptr %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double %tmp2, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store double %tmp3, ptr %tmp1, align 8
+; VF_8: Found an estimated cost of 24 for VF 8 For instruction: %tmp2 = load double, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp3 = load double, ptr %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp2, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp3, ptr %tmp1, align 8
; VF_16-LABEL: Checking a loop in 'f64_factor_2'
-; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp2 = load double, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load double, ptr %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double %tmp2, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store double %tmp3, ptr %tmp1, align 8
+; VF_16: Found an estimated cost of 48 for VF 16 For instruction: %tmp2 = load double, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp3 = load double, ptr %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp2, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp3, ptr %tmp1, align 8
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %f64.2, ptr %data, i64 %i, i32 0
@@ -306,33 +306,33 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'i8_factor_3'
-; VF_2: Found an estimated cost of 36 for VF 2 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp3, ptr %tmp0, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp4, ptr %tmp1, align 1
-; VF_2-NEXT: Found an estimated cost of 36 for VF 2 For instruction: store i8 %tmp5, ptr %tmp2, align 1
+; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp3, ptr %tmp0, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp4, ptr %tmp1, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp5, ptr %tmp2, align 1
; VF_4-LABEL: Checking a loop in 'i8_factor_3'
-; VF_4: Found an estimated cost of 72 for VF 4 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i8 %tmp3, ptr %tmp0, align 1
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i8 %tmp4, ptr %tmp1, align 1
-; VF_4-NEXT: Found an estimated cost of 72 for VF 4 For instruction: store i8 %tmp5, ptr %tmp2, align 1
+; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp3, ptr %tmp0, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp4, ptr %tmp1, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp5, ptr %tmp2, align 1
; VF_8-LABEL: Checking a loop in 'i8_factor_3'
-; VF_8: Found an estimated cost of 144 for VF 8 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp3, ptr %tmp0, align 1
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp4, ptr %tmp1, align 1
-; VF_8-NEXT: Found an estimated cost of 144 for VF 8 For instruction: store i8 %tmp5, ptr %tmp2, align 1
+; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp3, ptr %tmp0, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp4, ptr %tmp1, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp5, ptr %tmp2, align 1
; VF_16-LABEL: Checking a loop in 'i8_factor_3'
-; VF_16: Found an estimated cost of 288 for VF 16 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp3, ptr %tmp0, align 1
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp4, ptr %tmp1, align 1
-; VF_16-NEXT: Found an estimated cost of 288 for VF 16 For instruction: store i8 %tmp5, ptr %tmp2, align 1
+; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp3 = load i8, ptr %tmp0, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i8, ptr %tmp1, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i8, ptr %tmp2, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp3, ptr %tmp0, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp4, ptr %tmp1, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp5, ptr %tmp2, align 1
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i8.3, ptr %data, i64 %i, i32 0
@@ -358,33 +358,33 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'i16_factor_3'
-; VF_2: Found an estimated cost of 36 for VF 2 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp3, ptr %tmp0, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp4, ptr %tmp1, align 2
-; VF_2-NEXT: Found an estimated cost of 36 for VF 2 For instruction: store i16 %tmp5, ptr %tmp2, align 2
+; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp3, ptr %tmp0, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp4, ptr %tmp1, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp5, ptr %tmp2, align 2
; VF_4-LABEL: Checking a loop in 'i16_factor_3'
-; VF_4: Found an estimated cost of 72 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp3, ptr %tmp0, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp4, ptr %tmp1, align 2
-; VF_4-NEXT: Found an estimated cost of 72 for VF 4 For instruction: store i16 %tmp5, ptr %tmp2, align 2
+; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp3, ptr %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp4, ptr %tmp1, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp5, ptr %tmp2, align 2
; VF_8-LABEL: Checking a loop in 'i16_factor_3'
-; VF_8: Found an estimated cost of 144 for VF 8 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp3, ptr %tmp0, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp4, ptr %tmp1, align 2
-; VF_8-NEXT: Found an estimated cost of 144 for VF 8 For instruction: store i16 %tmp5, ptr %tmp2, align 2
+; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp3, ptr %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp4, ptr %tmp1, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp5, ptr %tmp2, align 2
; VF_16-LABEL: Checking a loop in 'i16_factor_3'
-; VF_16: Found an estimated cost of 288 for VF 16 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp3, ptr %tmp0, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp4, ptr %tmp1, align 2
-; VF_16-NEXT: Found an estimated cost of 288 for VF 16 For instruction: store i16 %tmp5, ptr %tmp2, align 2
+; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp3 = load i16, ptr %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i16, ptr %tmp1, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i16, ptr %tmp2, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp3, ptr %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp4, ptr %tmp1, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp5, ptr %tmp2, align 2
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i16.3, ptr %data, i64 %i, i32 0
@@ -410,33 +410,33 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'i32_factor_3'
-; VF_2: Found an estimated cost of 36 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp3, ptr %tmp0, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp4, ptr %tmp1, align 4
-; VF_2-NEXT: Found an estimated cost of 36 for VF 2 For instruction: store i32 %tmp5, ptr %tmp2, align 4
+; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp3, ptr %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp4, ptr %tmp1, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp5, ptr %tmp2, align 4
; VF_4-LABEL: Checking a loop in 'i32_factor_3'
-; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp3, ptr %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp4, ptr %tmp1, align 4
-; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i32 %tmp5, ptr %tmp2, align 4
+; VF_4: Found an estimated cost of 8 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 %tmp3, ptr %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 %tmp4, ptr %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 %tmp5, ptr %tmp2, align 4
; VF_8-LABEL: Checking a loop in 'i32_factor_3'
-; VF_8: Found an estimated cost of 144 for VF 8 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp3, ptr %tmp0, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp4, ptr %tmp1, align 4
-; VF_8-NEXT: Found an estimated cost of 144 for VF 8 For instruction: store i32 %tmp5, ptr %tmp2, align 4
+; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp3, ptr %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp4, ptr %tmp1, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp5, ptr %tmp2, align 4
; VF_16-LABEL: Checking a loop in 'i32_factor_3'
-; VF_16: Found an estimated cost of 288 for VF 16 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp3, ptr %tmp0, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp4, ptr %tmp1, align 4
-; VF_16-NEXT: Found an estimated cost of 288 for VF 16 For instruction: store i32 %tmp5, ptr %tmp2, align 4
+; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp3 = load i32, ptr %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i32, ptr %tmp1, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i32, ptr %tmp2, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp3, ptr %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp4, ptr %tmp1, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp5, ptr %tmp2, align 4
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i32.3, ptr %data, i64 %i, i32 0
@@ -462,33 +462,33 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'i64_factor_3'
-; VF_2: Found an estimated cost of 66 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp3, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp4, ptr %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 66 for VF 2 For instruction: store i64 %tmp5, ptr %tmp2, align 8
+; VF_2: Found an estimated cost of 22 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp3, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp4, ptr %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp5, ptr %tmp2, align 8
; VF_4-LABEL: Checking a loop in 'i64_factor_3'
-; VF_4: Found an estimated cost of 132 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp3, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp4, ptr %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 132 for VF 4 For instruction: store i64 %tmp5, ptr %tmp2, align 8
+; VF_4: Found an estimated cost of 44 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp3, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp4, ptr %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp5, ptr %tmp2, align 8
; VF_8-LABEL: Checking a loop in 'i64_factor_3'
-; VF_8: Found an estimated cost of 264 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp3, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp4, ptr %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 264 for VF 8 For instruction: store i64 %tmp5, ptr %tmp2, align 8
+; VF_8: Found an estimated cost of 88 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp3, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp4, ptr %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp5, ptr %tmp2, align 8
; VF_16-LABEL: Checking a loop in 'i64_factor_3'
-; VF_16: Found an estimated cost of 528 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp3, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp4, ptr %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 528 for VF 16 For instruction: store i64 %tmp5, ptr %tmp2, align 8
+; VF_16: Found an estimated cost of 176 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp4 = load i64, ptr %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp5 = load i64, ptr %tmp2, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp3, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp4, ptr %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp5, ptr %tmp2, align 8
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i64.3, ptr %data, i64 %i, i32 0
@@ -514,12 +514,12 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'f16_factor_3'
-; VF_2: Found an estimated cost of 18 for VF 2 For instruction: %tmp3 = load half, ptr %tmp0, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load half, ptr %tmp1, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load half, ptr %tmp2, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store half %tmp3, ptr %tmp0, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store half %tmp4, ptr %tmp1, align 2
-; VF_2-NEXT: Found an estimated cost of 18 for VF 2 For instruction: store half %tmp5, ptr %tmp2, align 2
+; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load half, ptr %tmp0, align 2
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp4 = load half, ptr %tmp1, align 2
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp5 = load half, ptr %tmp2, align 2
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp3, ptr %tmp0, align 2
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp4, ptr %tmp1, align 2
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp5, ptr %tmp2, align 2
; VF_4-LABEL: Checking a loop in 'f16_factor_3'
; VF_4: Found an estimated cost of 28 for VF 4 For instruction: %tmp3 = load half, ptr %tmp0, align 2
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load half, ptr %tmp1, align 2
@@ -573,12 +573,12 @@ entry:
; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store float %tmp4, ptr %tmp1, align 4
; VF_2-NEXT: Found an estimated cost of 16 for VF 2 For instruction: store float %tmp5, ptr %tmp2, align 4
; VF_4-LABEL: Checking a loop in 'f32_factor_3'
-; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load float, ptr %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load float, ptr %tmp1, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load float, ptr %tmp2, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float %tmp3, ptr %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float %tmp4, ptr %tmp1, align 4
-; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store float %tmp5, ptr %tmp2, align 4
+; VF_4: Found an estimated cost of 8 for VF 4 For instruction: %tmp3 = load float, ptr %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load float, ptr %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp5 = load float, ptr %tmp2, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float %tmp3, ptr %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float %tmp4, ptr %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float %tmp5, ptr %tmp2, align 4
; VF_8-LABEL: Checking a loop in 'f32_factor_3'
; VF_8: Found an estimated cost of 64 for VF 8 For instruction: %tmp3 = load float, ptr %tmp0, align 4
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load float, ptr %tmp1, align 4
@@ -618,33 +618,33 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'f64_factor_3'
-; VF_2: Found an estimated cost of 18 for VF 2 For instruction: %tmp3 = load double, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load double, ptr %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load double, ptr %tmp2, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double %tmp3, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double %tmp4, ptr %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 18 for VF 2 For instruction: store double %tmp5, ptr %tmp2, align 8
+; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load double, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp4 = load double, ptr %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp5 = load double, ptr %tmp2, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp3, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp4, ptr %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp5, ptr %tmp2, align 8
; VF_4-LABEL: Checking a loop in 'f64_factor_3'
-; VF_4: Found an estimated cost of 36 for VF 4 For instruction: %tmp3 = load double, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load double, ptr %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load double, ptr %tmp2, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double %tmp3, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double %tmp4, ptr %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 36 for VF 4 For instruction: store double %tmp5, ptr %tmp2, align 8
+; VF_4: Found an estimated cost of 12 for VF 4 For instruction: %tmp3 = load double, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp4 = load double, ptr %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp5 = load double, ptr %tmp2, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp3, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp4, ptr %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp5, ptr %tmp2, align 8
; VF_8-LABEL: Checking a loop in 'f64_factor_3'
-; VF_8: Found an estimated cost of 72 for VF 8 For instruction: %tmp3 = load double, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load double, ptr %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load double, ptr %tmp2, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double %tmp3, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double %tmp4, ptr %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 72 for VF 8 For instruction: store double %tmp5, ptr %tmp2, align 8
+; VF_8: Found an estimated cost of 24 for VF 8 For instruction: %tmp3 = load double, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp4 = load double, ptr %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp5 = load double, ptr %tmp2, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp3, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp4, ptr %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp5, ptr %tmp2, align 8
; VF_16-LABEL: Checking a loop in 'f64_factor_3'
-; VF_16: Found an estimated cost of 144 for VF 16 For instruction: %tmp3 = load double, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load double, ptr %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load double, ptr %tmp2, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double %tmp3, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double %tmp4, ptr %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 144 for VF 16 For instruction: store double %tmp5, ptr %tmp2, align 8
+; VF_16: Found an estimated cost of 48 for VF 16 For instruction: %tmp3 = load double, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp4 = load double, ptr %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp5 = load double, ptr %tmp2, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp3, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp4, ptr %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp5, ptr %tmp2, align 8
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %f64.3, ptr %data, i64 %i, i32 0
@@ -673,41 +673,41 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'i8_factor_4'
-; VF_2: Found an estimated cost of 48 for VF 2 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp4, ptr %tmp0, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp5, ptr %tmp1, align 1
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp6, ptr %tmp2, align 1
-; VF_2-NEXT: Found an estimated cost of 48 for VF 2 For instruction: store i8 %tmp7, ptr %tmp3, align 1
+; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp4, ptr %tmp0, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp5, ptr %tmp1, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp6, ptr %tmp2, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp7, ptr %tmp3, align 1
; VF_4-LABEL: Checking a loop in 'i8_factor_4'
-; VF_4: Found an estimated cost of 96 for VF 4 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i8 %tmp4, ptr %tmp0, align 1
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i8 %tmp5, ptr %tmp1, align 1
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i8 %tmp6, ptr %tmp2, align 1
-; VF_4-NEXT: Found an estimated cost of 96 for VF 4 For instruction: store i8 %tmp7, ptr %tmp3, align 1
+; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp4, ptr %tmp0, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp5, ptr %tmp1, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp6, ptr %tmp2, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp7, ptr %tmp3, align 1
; VF_8-LABEL: Checking a loop in 'i8_factor_4'
-; VF_8: Found an estimated cost of 192 for VF 8 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp4, ptr %tmp0, align 1
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp5, ptr %tmp1, align 1
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp6, ptr %tmp2, align 1
-; VF_8-NEXT: Found an estimated cost of 192 for VF 8 For instruction: store i8 %tmp7, ptr %tmp3, align 1
+; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp4, ptr %tmp0, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp5, ptr %tmp1, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp6, ptr %tmp2, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp7, ptr %tmp3, align 1
; VF_16-LABEL: Checking a loop in 'i8_factor_4'
-; VF_16: Found an estimated cost of 384 for VF 16 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp4, ptr %tmp0, align 1
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp5, ptr %tmp1, align 1
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp6, ptr %tmp2, align 1
-; VF_16-NEXT: Found an estimated cost of 384 for VF 16 For instruction: store i8 %tmp7, ptr %tmp3, align 1
+; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i8, ptr %tmp0, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i8, ptr %tmp1, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp6 = load i8, ptr %tmp2, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp7 = load i8, ptr %tmp3, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp4, ptr %tmp0, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp5, ptr %tmp1, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp6, ptr %tmp2, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp7, ptr %tmp3, align 1
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i8.4, ptr %data, i64 %i, i32 0
@@ -736,41 +736,41 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'i16_factor_4'
-; VF_2: Found an estimated cost of 48 for VF 2 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp4, ptr %tmp0, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp5, ptr %tmp1, align 2
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp6, ptr %tmp2, align 2
-; VF_2-NEXT: Found an estimated cost of 48 for VF 2 For instruction: store i16 %tmp7, ptr %tmp3, align 2
+; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp4, ptr %tmp0, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp5, ptr %tmp1, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp6, ptr %tmp2, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp7, ptr %tmp3, align 2
; VF_4-LABEL: Checking a loop in 'i16_factor_4'
-; VF_4: Found an estimated cost of 96 for VF 4 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp4, ptr %tmp0, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp5, ptr %tmp1, align 2
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp6, ptr %tmp2, align 2
-; VF_4-NEXT: Found an estimated cost of 96 for VF 4 For instruction: store i16 %tmp7, ptr %tmp3, align 2
+; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp4, ptr %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp5, ptr %tmp1, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp6, ptr %tmp2, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp7, ptr %tmp3, align 2
; VF_8-LABEL: Checking a loop in 'i16_factor_4'
-; VF_8: Found an estimated cost of 192 for VF 8 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp4, ptr %tmp0, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp5, ptr %tmp1, align 2
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp6, ptr %tmp2, align 2
-; VF_8-NEXT: Found an estimated cost of 192 for VF 8 For instruction: store i16 %tmp7, ptr %tmp3, align 2
+; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp4, ptr %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp5, ptr %tmp1, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp6, ptr %tmp2, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp7, ptr %tmp3, align 2
; VF_16-LABEL: Checking a loop in 'i16_factor_4'
-; VF_16: Found an estimated cost of 384 for VF 16 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp4, ptr %tmp0, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp5, ptr %tmp1, align 2
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp6, ptr %tmp2, align 2
-; VF_16-NEXT: Found an estimated cost of 384 for VF 16 For instruction: store i16 %tmp7, ptr %tmp3, align 2
+; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i16, ptr %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i16, ptr %tmp1, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp6 = load i16, ptr %tmp2, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp7 = load i16, ptr %tmp3, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp4, ptr %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp5, ptr %tmp1, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp6, ptr %tmp2, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp7, ptr %tmp3, align 2
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i16.4, ptr %data, i64 %i, i32 0
@@ -799,41 +799,41 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'i32_factor_4'
-; VF_2: Found an estimated cost of 48 for VF 2 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp4, ptr %tmp0, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp5, ptr %tmp1, align 4
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp6, ptr %tmp2, align 4
-; VF_2-NEXT: Found an estimated cost of 48 for VF 2 For instruction: store i32 %tmp7, ptr %tmp3, align 4
+; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp4, ptr %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp5, ptr %tmp1, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp6, ptr %tmp2, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp7, ptr %tmp3, align 4
; VF_4-LABEL: Checking a loop in 'i32_factor_4'
-; VF_4: Found an estimated cost of 32 for VF 4 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp4, ptr %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp5, ptr %tmp1, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp6, ptr %tmp2, align 4
-; VF_4-NEXT: Found an estimated cost of 32 for VF 4 For instruction: store i32 %tmp7, ptr %tmp3, align 4
+; VF_4: Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 %tmp4, ptr %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 %tmp5, ptr %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 %tmp6, ptr %tmp2, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 %tmp7, ptr %tmp3, align 4
; VF_8-LABEL: Checking a loop in 'i32_factor_4'
-; VF_8: Found an estimated cost of 192 for VF 8 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp4, ptr %tmp0, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp5, ptr %tmp1, align 4
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp6, ptr %tmp2, align 4
-; VF_8-NEXT: Found an estimated cost of 192 for VF 8 For instruction: store i32 %tmp7, ptr %tmp3, align 4
+; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp4, ptr %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp5, ptr %tmp1, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp6, ptr %tmp2, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp7, ptr %tmp3, align 4
; VF_16-LABEL: Checking a loop in 'i32_factor_4'
-; VF_16: Found an estimated cost of 384 for VF 16 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp4, ptr %tmp0, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp5, ptr %tmp1, align 4
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp6, ptr %tmp2, align 4
-; VF_16-NEXT: Found an estimated cost of 384 for VF 16 For instruction: store i32 %tmp7, ptr %tmp3, align 4
+; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i32, ptr %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i32, ptr %tmp1, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp6 = load i32, ptr %tmp2, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp7 = load i32, ptr %tmp3, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp4, ptr %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp5, ptr %tmp1, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp6, ptr %tmp2, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp7, ptr %tmp3, align 4
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i32.4, ptr %data, i64 %i, i32 0
@@ -862,41 +862,41 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'i64_factor_4'
-; VF_2: Found an estimated cost of 88 for VF 2 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp4, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp5, ptr %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp6, ptr %tmp2, align 8
-; VF_2-NEXT: Found an estimated cost of 88 for VF 2 For instruction: store i64 %tmp7, ptr %tmp3, align 8
+; VF_2: Found an estimated cost of 22 for VF 2 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp4, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp5, ptr %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp6, ptr %tmp2, align 8
+; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp7, ptr %tmp3, align 8
; VF_4-LABEL: Checking a loop in 'i64_factor_4'
-; VF_4: Found an estimated cost of 176 for VF 4 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp4, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp5, ptr %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp6, ptr %tmp2, align 8
-; VF_4-NEXT: Found an estimated cost of 176 for VF 4 For instruction: store i64 %tmp7, ptr %tmp3, align 8
+; VF_4: Found an estimated cost of 44 for VF 4 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp4, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp5, ptr %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp6, ptr %tmp2, align 8
+; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp7, ptr %tmp3, align 8
; VF_8-LABEL: Checking a loop in 'i64_factor_4'
-; VF_8: Found an estimated cost of 352 for VF 8 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp4, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp5, ptr %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp6, ptr %tmp2, align 8
-; VF_8-NEXT: Found an estimated cost of 352 for VF 8 For instruction: store i64 %tmp7, ptr %tmp3, align 8
+; VF_8: Found an estimated cost of 88 for VF 8 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp4, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp5, ptr %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp6, ptr %tmp2, align 8
+; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp7, ptr %tmp3, align 8
; VF_16-LABEL: Checking a loop in 'i64_factor_4'
-; VF_16: Found an estimated cost of 704 for VF 16 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp4, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp5, ptr %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp6, ptr %tmp2, align 8
-; VF_16-NEXT: Found an estimated cost of 704 for VF 16 For instruction: store i64 %tmp7, ptr %tmp3, align 8
+; VF_16: Found an estimated cost of 176 for VF 16 For instruction: %tmp4 = load i64, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp5 = load i64, ptr %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp6 = load i64, ptr %tmp2, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp7 = load i64, ptr %tmp3, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp4, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp5, ptr %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp6, ptr %tmp2, align 8
+; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp7, ptr %tmp3, align 8
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %i64.4, ptr %data, i64 %i, i32 0
@@ -997,14 +997,14 @@ entry:
; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store float %tmp6, ptr %tmp2, align 4
; VF_2-NEXT: Found an estimated cost of 20 for VF 2 For instruction: store float %tmp7, ptr %tmp3, align 4
; VF_4-LABEL: Checking a loop in 'f32_factor_4'
-; VF_4: Found an estimated cost of 32 for VF 4 For instruction: %tmp4 = load float, ptr %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load float, ptr %tmp1, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load float, ptr %tmp2, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load float, ptr %tmp3, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float %tmp4, ptr %tmp0, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float %tmp5, ptr %tmp1, align 4
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float %tmp6, ptr %tmp2, align 4
-; VF_4-NEXT: Found an estimated cost of 32 for VF 4 For instruction: store float %tmp7, ptr %tmp3, align 4
+; VF_4: Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load float, ptr %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp5 = load float, ptr %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp6 = load float, ptr %tmp2, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp7 = load float, ptr %tmp3, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float %tmp4, ptr %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float %tmp5, ptr %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float %tmp6, ptr %tmp2, align 4
+; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float %tmp7, ptr %tmp3, align 4
; VF_8-LABEL: Checking a loop in 'f32_factor_4'
; VF_8: Found an estimated cost of 80 for VF 8 For instruction: %tmp4 = load float, ptr %tmp0, align 4
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load float, ptr %tmp1, align 4
@@ -1051,41 +1051,41 @@ entry:
br label %for.body
; VF_2-LABEL: Checking a loop in 'f64_factor_4'
-; VF_2: Found an estimated cost of 24 for VF 2 For instruction: %tmp4 = load double, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load double, ptr %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load double, ptr %tmp2, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load double, ptr %tmp3, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double %tmp4, ptr %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double %tmp5, ptr %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double %tmp6, ptr %tmp2, align 8
-; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store double %tmp7, ptr %tmp3, align 8
+; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp4 = load double, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp5 = load double, ptr %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp6 = load double, ptr %tmp2, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp7 = load double, ptr %tmp3, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp4, ptr %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp5, ptr %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp6, ptr %tmp2, align 8
+; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp7, ptr %tmp3, align 8
; VF_4-LABEL: Checking a loop in 'f64_factor_4'
-; VF_4: Found an estimated cost of 48 for VF 4 For instruction: %tmp4 = load double, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load double, ptr %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load double, ptr %tmp2, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load double, ptr %tmp3, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double %tmp4, ptr %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double %tmp5, ptr %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double %tmp6, ptr %tmp2, align 8
-; VF_4-NEXT: Found an estimated cost of 48 for VF 4 For instruction: store double %tmp7, ptr %tmp3, align 8
+; VF_4: Found an estimated cost of 12 for VF 4 For instruction: %tmp4 = load double, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp5 = load double, ptr %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp6 = load double, ptr %tmp2, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp7 = load double, ptr %tmp3, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp4, ptr %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp5, ptr %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp6, ptr %tmp2, align 8
+; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp7, ptr %tmp3, align 8
; VF_8-LABEL: Checking a loop in 'f64_factor_4'
-; VF_8: Found an estimated cost of 96 for VF 8 For instruction: %tmp4 = load double, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load double, ptr %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load double, ptr %tmp2, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load double, ptr %tmp3, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double %tmp4, ptr %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double %tmp5, ptr %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double %tmp6, ptr %tmp2, align 8
-; VF_8-NEXT: Found an estimated cost of 96 for VF 8 For instruction: store double %tmp7, ptr %tmp3, align 8
+; VF_8: Found an estimated cost of 24 for VF 8 For instruction: %tmp4 = load double, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp5 = load double, ptr %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp6 = load double, ptr %tmp2, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp7 = load double, ptr %tmp3, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp4, ptr %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp5, ptr %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp6, ptr %tmp2, align 8
+; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp7, ptr %tmp3, align 8
; VF_16-LABEL: Checking a loop in 'f64_factor_4'
-; VF_16: Found an estimated cost of 192 for VF 16 For instruction: %tmp4 = load double, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load double, ptr %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load double, ptr %tmp2, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load double, ptr %tmp3, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double %tmp4, ptr %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double %tmp5, ptr %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double %tmp6, ptr %tmp2, align 8
-; VF_16-NEXT: Found an estimated cost of 192 for VF 16 For instruction: store double %tmp7, ptr %tmp3, align 8
+; VF_16: Found an estimated cost of 48 for VF 16 For instruction: %tmp4 = load double, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp5 = load double, ptr %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp6 = load double, ptr %tmp2, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp7 = load double, ptr %tmp3, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp4, ptr %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp5, ptr %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp6, ptr %tmp2, align 8
+; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp7, ptr %tmp3, align 8
for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr inbounds %f64.4, ptr %data, i64 %i, i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
index 6d309c4..df02cb7 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -176,7 +176,7 @@ declare i16 @llvm.umax.i16(i16, i16)
; Test case for https://github.com/llvm/llvm-project/issues/106780.
define i32 @cost_of_exit_branch_and_cond_insts(ptr %a, ptr %b, i1 %c, i16 %x) #0 {
; CHECK-LABEL: define i32 @cost_of_exit_branch_and_cond_insts(
-; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i1 [[C:%.*]], i16 [[X:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i1 [[C:%.*]], i16 [[X:%.*]]) #[[ATTR2:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[X]] to i32
; CHECK-NEXT: [[UMAX3:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP0]], i32 111)
@@ -404,6 +404,95 @@ exit:
ret void
}
+; Test for https://github.com/llvm/llvm-project/issues/108098.
+define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %src, ptr noalias %dst) #0 {
+; CHECK-LABEL: define void @gather_interleave_group_with_dead_insert_pos(
+; CHECK-SAME: i64 [[N:%.*]], ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0)
+; CHECK-NEXT: [[TMP0:%.*]] = add nuw i64 [[SMAX]], 3
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 16
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 16, i64 [[N_MOD_VF]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 4
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 4, i64 8, i64 12, i64 16, i64 20, i64 24, i64 28>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add <8 x i64> [[VEC_IND]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 32
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <32 x i8>, ptr [[TMP9]], align 1
+; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <32 x i8>, ptr [[TMP10]], align 1
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <32 x i8> [[WIDE_VEC2]], <32 x i8> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <32 x i8> [[WIDE_VEC2]], <32 x i8> poison, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+; CHECK-NEXT: [[TMP11:%.*]] = zext <8 x i8> [[STRIDED_VEC4]] to <8 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = zext <8 x i8> [[STRIDED_VEC5]] to <8 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[DST]], <8 x i64> [[VEC_IND]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[DST]], <8 x i64> [[STEP_ADD]]
+; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[TMP11]], <8 x ptr> [[TMP13]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[TMP12]], <8 x ptr> [[TMP14]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[STEP_ADD]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT: [[L_DEAD:%.*]] = load i8, ptr [[GEP_SRC_0]], align 1
+; CHECK-NEXT: [[IV_1:%.*]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV_1]]
+; CHECK-NEXT: [[L_1:%.*]] = load i8, ptr [[GEP_SRC_1]], align 1
+; CHECK-NEXT: [[EXT:%.*]] = zext i8 [[L_1]] to i32
+; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr i32, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT: store i32 [[EXT]], ptr [[GEP_DST]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 4
+; CHECK-NEXT: [[EC:%.*]] = icmp slt i64 [[IV]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+ %gep.src.0 = getelementptr i8, ptr %src, i64 %iv
+ %l.dead = load i8, ptr %gep.src.0, align 1
+ %iv.1 = add i64 %iv, 1
+ %gep.src.1 = getelementptr i8, ptr %src, i64 %iv.1
+ %l.1 = load i8, ptr %gep.src.1, align 1
+ %ext = zext i8 %l.1 to i32
+ %gep.dst = getelementptr i32, ptr %dst, i64 %iv
+ store i32 %ext, ptr %gep.dst, align 4
+ %iv.next = add nsw i64 %iv, 4
+ %ec = icmp slt i64 %iv, %N
+ br i1 %ec, label %loop, label %exit
+
+exit:
+ ret void
+}
+
+attributes #0 = { "target-features"="+64bit,+v" }
+
;.
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -425,4 +514,6 @@ exit:
; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]]}
; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]}
; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META2]], [[META1]]}
+; CHECK: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]], [[META2]]}
+; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[META2]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-03.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-03.ll
index 88eb9c4..13c443c 100644
--- a/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-03.ll
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-03.ll
@@ -6,7 +6,8 @@
; Check the cost function for the <8 x i128> store interleave group.
; CHECK: LV: Checking a loop in 'fun'
-; CHECK: LV: Found an estimated cost of 8 for VF 4 For instruction: store i128 8721036757475490113
+; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: store i128 8721036757475490113
+; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: store i128 8721036757475490113
define noundef i32 @fun(i32 %argc, ptr nocapture readnone %argv) {
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/use-iv-start-value.ll b/llvm/test/Transforms/LoopVectorize/use-iv-start-value.ll
new file mode 100644
index 0000000..3d2c2e5
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/use-iv-start-value.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s
+
+; Check that we correctly handle the use of %start2 in the exit block, and do
+; not crash.
+
+define i64 @foo(ptr %p1, ptr %p2, i64 %start, i64 %end) {
+; CHECK-LABEL: define i64 @foo(
+; CHECK-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]], i64 [[START:%.*]], i64 [[END:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[START2:%.*]] = and i64 [[START]], 12345
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[END]], [[START2]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[START2]], [[N_VEC]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[START2]], [[INDEX]]
+; CHECK-NEXT: [[IND:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[IND]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX1]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[IND]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START2]], %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[IND1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IND_NEXT1:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[IND1]]
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[IND1]]
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[IND_NEXT1]] = add i64 [[IND1]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[IND_NEXT1]], [[END]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[USE:%.*]] = phi i64 [ [[START2]], %[[FOR_BODY]] ], [ [[START2]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i64 [[USE]]
+;
+entry:
+ %start2 = and i64 %start, 12345
+ br label %for.body
+
+for.body:
+ %ind = phi i64 [ %start2, %entry ], [ %ind.next, %for.body ]
+ %arrayidx1 = getelementptr inbounds i32, ptr %p1, i64 %ind
+ %0 = load i32, ptr %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %p2, i64 %ind
+ %1 = load i32, ptr %arrayidx2, align 4
+ %ind.next = add i64 %ind, 1
+ %cmp = icmp ne i64 %ind.next, %end
+ br i1 %cmp, label %for.body, label %exit
+
+exit:
+ %use = phi i64 [ %start2, %for.body ]
+ ret i64 %use
+}
+
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll b/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll
index 7959e4d..56ed92e 100644
--- a/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll
+++ b/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll
@@ -4,6 +4,7 @@
; There is currently no scenario for using the ctx profile without thinlto.
;
; RUN: opt -passes='thinlto-pre-link<O2>' -use-ctx-profile=something_that_does_not_exist %s -S | FileCheck %s
+; RUN: opt -debug-info-for-profiling -passes='thinlto-pre-link<O2>' -use-ctx-profile=something_that_does_not_exist %s -S | FileCheck %s
declare void @bar()
diff --git a/llvm/tools/llvm-dis/llvm-dis.cpp b/llvm/tools/llvm-dis/llvm-dis.cpp
index d28af85..a3a62f0 100644
--- a/llvm/tools/llvm-dis/llvm-dis.cpp
+++ b/llvm/tools/llvm-dis/llvm-dis.cpp
@@ -11,7 +11,23 @@
// llvm-dis [options] x.bc - Read LLVM bitcode from the x.bc file, write asm
// to the x.ll file.
// Options:
-// --help - Output information about command line switches
+//
+// Color Options:
+// --color - Use colors in output (default=autodetect)
+//
+// Disassembler Options:
+// -f - Enable binary output on terminals
+// --materialize-metadata - Load module without materializing metadata,
+// then materialize only the metadata
+// -o <filename> - Override output filename
+// --show-annotations - Add informational comments to the .ll file
+//
+// Generic Options:
+// --help - Display available options
+// (--help-hidden for more)
+// --help-list - Display list of available options
+// (--help-list-hidden for more)
+// --version - Display the version of this program
//
//===----------------------------------------------------------------------===//
diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
index b76d24d..148afd9 100644
--- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp
+++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
@@ -1354,14 +1354,18 @@ define void @foo(i1 %c0, i8 %v0, i8 %v1, i1 %c1) {
auto *BB = &*F->begin();
auto It = BB->begin();
auto *Select = cast<sandboxir::SelectInst>(&*It++);
+ const auto *ConstSelect = Select; // To test the const getters.
auto *Ret = &*It++;
// Check getCondition().
EXPECT_EQ(Select->getCondition(), Cond0);
+ EXPECT_EQ(ConstSelect->getCondition(), Cond0);
// Check getTrueValue().
EXPECT_EQ(Select->getTrueValue(), V0);
+ EXPECT_EQ(ConstSelect->getTrueValue(), V0);
// Check getFalseValue().
EXPECT_EQ(Select->getFalseValue(), V1);
+ EXPECT_EQ(ConstSelect->getFalseValue(), V1);
// Check setCondition().
Select->setCondition(Cond1);
EXPECT_EQ(Select->getCondition(), Cond1);
@@ -1371,6 +1375,13 @@ define void @foo(i1 %c0, i8 %v0, i8 %v1, i1 %c1) {
// Check setFalseValue().
Select->setFalseValue(V0);
EXPECT_EQ(Select->getFalseValue(), V0);
+ // Check swapValues().
+ Select->swapValues();
+ EXPECT_EQ(Select->getTrueValue(), V0);
+ EXPECT_EQ(Select->getFalseValue(), V1);
+ // Check areInvalidOperands.
+ EXPECT_EQ(sandboxir::SelectInst::areInvalidOperands(Cond0, V0, V1), nullptr);
+ EXPECT_NE(sandboxir::SelectInst::areInvalidOperands(V0, V1, Cond0), nullptr);
{
// Check SelectInst::create() InsertBefore.
diff --git a/llvm/unittests/SandboxIR/TrackerTest.cpp b/llvm/unittests/SandboxIR/TrackerTest.cpp
index a1f39fe..a1a4117 100644
--- a/llvm/unittests/SandboxIR/TrackerTest.cpp
+++ b/llvm/unittests/SandboxIR/TrackerTest.cpp
@@ -964,6 +964,32 @@ define void @foo(i32 %cond0, i32 %cond1) {
EXPECT_EQ(Switch->findCaseDest(BB1), One);
}
+TEST_F(TrackerTest, SelectInst) {
+ parseIR(C, R"IR(
+define void @foo(i1 %c0, i8 %v0, i8 %v1) {
+ %sel = select i1 %c0, i8 %v0, i8 %v1
+ ret void
+}
+)IR");
+ llvm::Function *LLVMF = &*M->getFunction("foo");
+ sandboxir::Context Ctx(C);
+ sandboxir::Function *F = Ctx.createFunction(LLVMF);
+ auto *V0 = F->getArg(1);
+ auto *V1 = F->getArg(2);
+ auto *BB = &*F->begin();
+ auto It = BB->begin();
+ auto *Select = cast<sandboxir::SelectInst>(&*It++);
+
+ // Check tracking for swapValues.
+ Ctx.save();
+ Select->swapValues();
+ EXPECT_EQ(Select->getTrueValue(), V1);
+ EXPECT_EQ(Select->getFalseValue(), V0);
+ Ctx.revert();
+ EXPECT_EQ(Select->getTrueValue(), V0);
+ EXPECT_EQ(Select->getFalseValue(), V1);
+}
+
TEST_F(TrackerTest, ShuffleVectorInst) {
parseIR(C, R"IR(
define void @foo(<2 x i8> %v1, <2 x i8> %v2) {
diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
index c5849b6..ef51864 100644
--- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp
@@ -504,7 +504,7 @@ struct MatchableInfo {
/// TheDef - This is the definition of the instruction or InstAlias that this
/// matchable came from.
- Record *const TheDef;
+ const Record *const TheDef;
// ResInstSize - The size of the resulting instruction for this matchable.
unsigned ResInstSize;
@@ -762,7 +762,7 @@ public:
RegisterClassesTy RegisterClasses;
/// Map of Predicate records to their subtarget information.
- std::map<Record *, SubtargetFeatureInfo, LessRecordByID> SubtargetFeatures;
+ SubtargetFeatureInfoMap SubtargetFeatures;
/// Map of AsmOperandClass records to their class information.
std::map<const Record *, ClassInfo *> AsmOperandClasses;
@@ -1338,7 +1338,7 @@ void AsmMatcherInfo::buildRegisterClasses(
// Name the register classes which correspond to a user defined RegisterClass.
for (const CodeGenRegisterClass &RC : RegClassList) {
// Def will be NULL for non-user defined register classes.
- Record *Def = RC.getDef();
+ const Record *Def = RC.getDef();
if (!Def)
continue;
ClassInfo *CI = RegisterSetClasses[RegisterSet(RC.getOrder().begin(),
@@ -1513,8 +1513,8 @@ void AsmMatcherInfo::buildOperandMatchInfo() {
void AsmMatcherInfo::buildInfo() {
// Build information about all of the AssemblerPredicates.
- const std::vector<std::pair<Record *, SubtargetFeatureInfo>>
- &SubtargetFeaturePairs = SubtargetFeatureInfo::getAll(Records);
+ SubtargetFeaturesInfoVec SubtargetFeaturePairs =
+ SubtargetFeatureInfo::getAll(Records);
SubtargetFeatures.insert(SubtargetFeaturePairs.begin(),
SubtargetFeaturePairs.end());
#ifndef NDEBUG
@@ -3226,9 +3226,9 @@ static void emitMatchClassKindNames(std::forward_list<ClassInfo> &Infos,
}
static std::string
-getNameForFeatureBitset(const std::vector<Record *> &FeatureBitset) {
+getNameForFeatureBitset(ArrayRef<const Record *> FeatureBitset) {
std::string Name = "AMFBS";
- for (const auto &Feature : FeatureBitset)
+ for (const Record *Feature : FeatureBitset)
Name += ("_" + Feature->getName()).str();
return Name;
}
@@ -3451,7 +3451,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
StringTable.EmitString(OS);
OS << ";\n\n";
- std::vector<std::vector<Record *>> FeatureBitsets;
+ std::vector<std::vector<const Record *>> FeatureBitsets;
for (const auto &MI : Info.Matchables) {
if (MI->RequiredFeatures.empty())
continue;
@@ -3460,8 +3460,8 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
FeatureBitsets.back().push_back(MI->RequiredFeatures[I]->TheDef);
}
- llvm::sort(FeatureBitsets, [&](const std::vector<Record *> &A,
- const std::vector<Record *> &B) {
+ llvm::sort(FeatureBitsets, [&](const std::vector<const Record *> &A,
+ const std::vector<const Record *> &B) {
if (A.size() < B.size())
return true;
if (A.size() > B.size())
diff --git a/llvm/utils/TableGen/CodeEmitterGen.cpp b/llvm/utils/TableGen/CodeEmitterGen.cpp
index 88acd79..69ca9a8 100644
--- a/llvm/utils/TableGen/CodeEmitterGen.cpp
+++ b/llvm/utils/TableGen/CodeEmitterGen.cpp
@@ -403,7 +403,7 @@ void CodeEmitterGen::emitInstructionBaseValues(
<< HWM.getModeName(HwMode, /*IncludeDefault=*/true) << "[] = {\n";
for (const CodeGenInstruction *CGI : NumberedInstructions) {
- Record *R = CGI->TheDef;
+ const Record *R = CGI->TheDef;
if (R->getValueAsString("Namespace") == "TargetOpcode" ||
R->getValueAsBit("isPseudo")) {
@@ -485,7 +485,7 @@ void CodeEmitterGen::run(raw_ostream &o) {
std::set<unsigned> HwModes;
BitWidth = 0;
for (const CodeGenInstruction *CGI : NumberedInstructions) {
- Record *R = CGI->TheDef;
+ const Record *R = CGI->TheDef;
if (R->getValueAsString("Namespace") == "TargetOpcode" ||
R->getValueAsBit("isPseudo"))
continue;
diff --git a/llvm/utils/TableGen/CodeGenMapTable.cpp b/llvm/utils/TableGen/CodeGenMapTable.cpp
index fbf1d47..46aad7f 100644
--- a/llvm/utils/TableGen/CodeGenMapTable.cpp
+++ b/llvm/utils/TableGen/CodeGenMapTable.cpp
@@ -185,7 +185,7 @@ private:
// KeyInstrVec - list of key instructions.
std::vector<Record *> KeyInstrVec;
- DenseMap<Record *, std::vector<Record *>> MapTable;
+ DenseMap<const Record *, std::vector<Record *>> MapTable;
public:
MapTableEmitter(CodeGenTarget &Target, RecordKeeper &Records, Record *IMRec)
@@ -371,7 +371,7 @@ unsigned MapTableEmitter::emitBinSearchTable(raw_ostream &OS) {
// emitted as first column.
OS << "Table[][" << NumCol + 1 << "] = {\n";
for (unsigned i = 0; i < TotalNumInstr; i++) {
- Record *CurInstr = NumberedInstructions[i]->TheDef;
+ const Record *CurInstr = NumberedInstructions[i]->TheDef;
std::vector<Record *> ColInstrs = MapTable[CurInstr];
std::string OutStr;
unsigned RelExists = 0;
diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
index 4582478..a77e247 100644
--- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
@@ -1255,28 +1255,28 @@ bool TreePredicateFn::isAtomicOrderingWeakerThanRelease() const {
false);
}
Record *TreePredicateFn::getMemoryVT() const {
- Record *R = getOrigPatFragRecord()->getRecord();
+ const Record *R = getOrigPatFragRecord()->getRecord();
if (R->isValueUnset("MemoryVT"))
return nullptr;
return R->getValueAsDef("MemoryVT");
}
ListInit *TreePredicateFn::getAddressSpaces() const {
- Record *R = getOrigPatFragRecord()->getRecord();
+ const Record *R = getOrigPatFragRecord()->getRecord();
if (R->isValueUnset("AddressSpaces"))
return nullptr;
return R->getValueAsListInit("AddressSpaces");
}
int64_t TreePredicateFn::getMinAlignment() const {
- Record *R = getOrigPatFragRecord()->getRecord();
+ const Record *R = getOrigPatFragRecord()->getRecord();
if (R->isValueUnset("MinAlignment"))
return 0;
return R->getValueAsInt("MinAlignment");
}
Record *TreePredicateFn::getScalarMemoryVT() const {
- Record *R = getOrigPatFragRecord()->getRecord();
+ const Record *R = getOrigPatFragRecord()->getRecord();
if (R->isValueUnset("ScalarMemoryVT"))
return nullptr;
return R->getValueAsDef("ScalarMemoryVT");
@@ -1390,7 +1390,7 @@ std::string TreePredicateFn::getCodeToRunOnSDNode() const {
if (Tree->isLeaf())
TreeClassName = "SDNode";
else {
- Record *Op = Tree->getOperator();
+ const Record *Op = Tree->getOperator();
const SDNodeInfo &Info = PatFragRec->getDAGPatterns().getSDNodeInfo(Op);
TreeClassName = Info.getSDClassName();
}
@@ -1848,7 +1848,8 @@ MVT::SimpleValueType SDNodeInfo::getKnownType(unsigned ResNo) const {
// TreePatternNode implementation
//
-static unsigned GetNumNodeResults(Record *Operator, CodeGenDAGPatterns &CDP) {
+static unsigned GetNumNodeResults(const Record *Operator,
+ CodeGenDAGPatterns &CDP) {
if (Operator->getName() == "set" || Operator->getName() == "implicit")
return 0; // All return nothing.
@@ -2077,7 +2078,7 @@ void TreePatternNode::InlinePatternFragments(
return;
}
- Record *Op = getOperator();
+ const Record *Op = getOperator();
if (!Op->isSubClassOf("PatFrags")) {
if (getNumChildren() == 0) {
@@ -2340,7 +2341,7 @@ TreePatternNode::getIntrinsicInfo(const CodeGenDAGPatterns &CDP) const {
/// return the ComplexPattern information, otherwise return null.
const ComplexPattern *
TreePatternNode::getComplexPatternInfo(const CodeGenDAGPatterns &CGP) const {
- Record *Rec;
+ const Record *Rec;
if (isLeaf()) {
DefInit *DI = dyn_cast<DefInit>(getLeafValue());
if (!DI)
@@ -2793,7 +2794,7 @@ bool TreePatternNode::canPatternMatch(std::string &Reason,
// TreePattern implementation
//
-TreePattern::TreePattern(Record *TheRec, ListInit *RawPat, bool isInput,
+TreePattern::TreePattern(const Record *TheRec, ListInit *RawPat, bool isInput,
CodeGenDAGPatterns &cdp)
: TheRecord(TheRec), CDP(cdp), isInputPattern(isInput), HasError(false),
Infer(*this) {
@@ -2801,15 +2802,15 @@ TreePattern::TreePattern(Record *TheRec, ListInit *RawPat, bool isInput,
Trees.push_back(ParseTreePattern(I, ""));
}
-TreePattern::TreePattern(Record *TheRec, DagInit *Pat, bool isInput,
+TreePattern::TreePattern(const Record *TheRec, DagInit *Pat, bool isInput,
CodeGenDAGPatterns &cdp)
: TheRecord(TheRec), CDP(cdp), isInputPattern(isInput), HasError(false),
Infer(*this) {
Trees.push_back(ParseTreePattern(Pat, ""));
}
-TreePattern::TreePattern(Record *TheRec, TreePatternNodePtr Pat, bool isInput,
- CodeGenDAGPatterns &cdp)
+TreePattern::TreePattern(const Record *TheRec, TreePatternNodePtr Pat,
+ bool isInput, CodeGenDAGPatterns &cdp)
: TheRecord(TheRec), CDP(cdp), isInputPattern(isInput), HasError(false),
Infer(*this) {
Trees.push_back(Pat);
@@ -3389,7 +3390,7 @@ static bool HandleUse(TreePattern &I, TreePatternNodePtr Pat,
return false;
}
- Record *Rec;
+ const Record *Rec;
if (Pat->isLeaf()) {
DefInit *DI = dyn_cast<DefInit>(Pat->getLeafValue());
if (!DI)
@@ -3408,7 +3409,7 @@ static bool HandleUse(TreePattern &I, TreePatternNodePtr Pat,
Slot = Pat;
return true;
}
- Record *SlotRec;
+ const Record *SlotRec;
if (Slot->isLeaf()) {
SlotRec = cast<DefInit>(Slot->getLeafValue())->getDef();
} else {
@@ -3633,7 +3634,8 @@ public:
};
static bool InferFromPattern(CodeGenInstruction &InstInfo,
- const InstAnalyzer &PatInfo, Record *PatDef) {
+ const InstAnalyzer &PatInfo,
+ const Record *PatDef) {
bool Error = false;
// Remember where InstInfo got its flags.
@@ -3729,7 +3731,7 @@ static bool hasNullFragReference(ListInit *LI) {
/// Get all the instructions in a tree.
static void getInstructionsInTree(TreePatternNode &Tree,
- SmallVectorImpl<Record *> &Instrs) {
+ SmallVectorImpl<const Record *> &Instrs) {
if (Tree.isLeaf())
return;
if (Tree.getOperator()->isSubClassOf("Instruction"))
@@ -3935,8 +3937,7 @@ void CodeGenDAGPatterns::parseInstructionPattern(CodeGenInstruction &CGI,
// Create and insert the instruction.
// FIXME: InstImpResults should not be part of DAGInstruction.
- Record *R = I.getRecord();
- DAGInsts.try_emplace(R, std::move(Results), std::move(Operands),
+ DAGInsts.try_emplace(I.getRecord(), std::move(Results), std::move(Operands),
std::move(InstImpResults), SrcPattern, ResultPattern);
LLVM_DEBUG(I.dump());
@@ -3989,9 +3990,7 @@ void CodeGenDAGPatterns::ParseInstructions() {
}
// If we can, convert the instructions to be patterns that are matched!
- for (auto &Entry : Instructions) {
- Record *Instr = Entry.first;
- DAGInstruction &TheInst = Entry.second;
+ for (const auto &[Instr, TheInst] : Instructions) {
TreePatternNodePtr SrcPattern = TheInst.getSrcPattern();
TreePatternNodePtr ResultPattern = TheInst.getResultPattern();
@@ -4078,7 +4077,7 @@ void CodeGenDAGPatterns::InferInstructionFlags() {
for (const PatternToMatch &PTM : ptms()) {
// We can only infer from single-instruction patterns, otherwise we won't
// know which instruction should get the flags.
- SmallVector<Record *, 8> PatInstrs;
+ SmallVector<const Record *, 8> PatInstrs;
getInstructionsInTree(PTM.getDstPattern(), PatInstrs);
if (PatInstrs.size() != 1)
continue;
@@ -4135,7 +4134,7 @@ void CodeGenDAGPatterns::InferInstructionFlags() {
void CodeGenDAGPatterns::VerifyInstructionFlags() {
unsigned Errors = 0;
for (const PatternToMatch &PTM : ptms()) {
- SmallVector<Record *, 8> Instrs;
+ SmallVector<const Record *, 8> Instrs;
getInstructionsInTree(PTM.getDstPattern(), Instrs);
if (Instrs.empty())
continue;
@@ -4245,7 +4244,7 @@ static TreePatternNodePtr PromoteXForms(TreePatternNodePtr N) {
}
void CodeGenDAGPatterns::ParseOnePattern(
- Record *TheDef, TreePattern &Pattern, TreePattern &Result,
+ const Record *TheDef, TreePattern &Pattern, TreePattern &Result,
const std::vector<Record *> &InstImpResults, bool ShouldIgnore) {
// Inline pattern fragments and expand multiple alternatives.
@@ -4591,7 +4590,7 @@ GatherChildrenOfAssociativeOpcode(TreePatternNodePtr N,
std::vector<TreePatternNodePtr> &Children) {
assert(N->getNumChildren() == 2 &&
"Associative but doesn't have 2 children!");
- Record *Operator = N->getOperator();
+ const Record *Operator = N->getOperator();
// Only permit raw nodes.
if (!N->getName().empty() || !N->getPredicateCalls().empty() ||
diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h
index 88a5437..4dc08e6 100644
--- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h
+++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h
@@ -634,7 +634,7 @@ class TreePatternNode : public RefCountedBase<TreePatternNode> {
/// OperatorOrVal - The Record for the operator if this is an interior node
/// (not a leaf) or the init value (e.g. the "GPRC" record, or "7") for a
/// leaf.
- PointerUnion<Record *, Init *> OperatorOrVal;
+ PointerUnion<const Record *, Init *> OperatorOrVal;
/// Name - The name given to this node with the :$foo notation.
///
@@ -657,7 +657,7 @@ class TreePatternNode : public RefCountedBase<TreePatternNode> {
const Record *GISelFlags = nullptr;
public:
- TreePatternNode(Record *Op, std::vector<TreePatternNodePtr> Ch,
+ TreePatternNode(const Record *Op, std::vector<TreePatternNodePtr> Ch,
unsigned NumResults)
: OperatorOrVal(Op), TransformFn(nullptr), Children(std::move(Ch)) {
Types.resize(NumResults);
@@ -717,9 +717,9 @@ public:
assert(isLeaf());
return cast<Init *>(OperatorOrVal);
}
- Record *getOperator() const {
+ const Record *getOperator() const {
assert(!isLeaf());
- return cast<Record *>(OperatorOrVal);
+ return cast<const Record *>(OperatorOrVal);
}
unsigned getNumChildren() const { return Children.size(); }
@@ -878,7 +878,7 @@ class TreePattern {
/// TheRecord - The actual TableGen record corresponding to this pattern.
///
- Record *TheRecord;
+ const Record *TheRecord;
/// Args - This is a list of all of the arguments to this pattern (for
/// PatFrag patterns), which are the 'node' markers in this pattern.
@@ -908,11 +908,11 @@ class TreePattern {
public:
/// TreePattern constructor - Parse the specified DagInits into the
/// current record.
- TreePattern(Record *TheRec, ListInit *RawPat, bool isInput,
+ TreePattern(const Record *TheRec, ListInit *RawPat, bool isInput,
CodeGenDAGPatterns &ise);
- TreePattern(Record *TheRec, DagInit *Pat, bool isInput,
+ TreePattern(const Record *TheRec, DagInit *Pat, bool isInput,
CodeGenDAGPatterns &ise);
- TreePattern(Record *TheRec, TreePatternNodePtr Pat, bool isInput,
+ TreePattern(const Record *TheRec, TreePatternNodePtr Pat, bool isInput,
CodeGenDAGPatterns &ise);
/// getTrees - Return the tree patterns which corresponds to this pattern.
@@ -935,7 +935,7 @@ public:
/// getRecord - Return the actual TableGen record corresponding to this
/// pattern.
///
- Record *getRecord() const { return TheRecord; }
+ const Record *getRecord() const { return TheRecord; }
unsigned getNumArgs() const { return Args.size(); }
const std::string &getArgName(unsigned i) const {
@@ -1054,7 +1054,7 @@ public:
/// PatternToMatch - Used by CodeGenDAGPatterns to keep tab of patterns
/// processed to produce isel.
class PatternToMatch {
- Record *SrcRecord; // Originating Record for the pattern.
+ const Record *SrcRecord; // Originating Record for the pattern.
ListInit *Predicates; // Top level predicate conditions to match.
TreePatternNodePtr SrcPattern; // Source pattern to match.
TreePatternNodePtr DstPattern; // Resulting pattern.
@@ -1065,16 +1065,16 @@ class PatternToMatch {
unsigned ID; // Unique ID for the record.
public:
- PatternToMatch(Record *srcrecord, ListInit *preds, TreePatternNodePtr src,
- TreePatternNodePtr dst, std::vector<Record *> dstregs,
- int complexity, unsigned uid, bool ignore,
- const Twine &hwmodefeatures = "")
+ PatternToMatch(const Record *srcrecord, ListInit *preds,
+ TreePatternNodePtr src, TreePatternNodePtr dst,
+ std::vector<Record *> dstregs, int complexity, unsigned uid,
+ bool ignore, const Twine &hwmodefeatures = "")
: SrcRecord(srcrecord), Predicates(preds), SrcPattern(src),
DstPattern(dst), Dstregs(std::move(dstregs)),
HwModeFeatures(hwmodefeatures.str()), AddedComplexity(complexity),
GISelShouldIgnore(ignore), ID(uid) {}
- Record *getSrcRecord() const { return SrcRecord; }
+ const Record *getSrcRecord() const { return SrcRecord; }
ListInit *getPredicates() const { return Predicates; }
TreePatternNode &getSrcPattern() const { return *SrcPattern; }
TreePatternNodePtr getSrcPatternShared() const { return SrcPattern; }
@@ -1099,14 +1099,14 @@ class CodeGenDAGPatterns {
CodeGenTarget Target;
CodeGenIntrinsicTable Intrinsics;
- std::map<Record *, SDNodeInfo, LessRecordByID> SDNodes;
- std::map<Record *, std::pair<Record *, std::string>, LessRecordByID>
+ std::map<const Record *, SDNodeInfo, LessRecordByID> SDNodes;
+ std::map<const Record *, std::pair<Record *, std::string>, LessRecordByID>
SDNodeXForms;
- std::map<Record *, ComplexPattern, LessRecordByID> ComplexPatterns;
- std::map<Record *, std::unique_ptr<TreePattern>, LessRecordByID>
+ std::map<const Record *, ComplexPattern, LessRecordByID> ComplexPatterns;
+ std::map<const Record *, std::unique_ptr<TreePattern>, LessRecordByID>
PatternFragments;
std::map<const Record *, DAGDefaultOperand, LessRecordByID> DefaultOperands;
- std::map<Record *, DAGInstruction, LessRecordByID> Instructions;
+ std::map<const Record *, DAGInstruction, LessRecordByID> Instructions;
// Specific SDNode definitions:
Record *intrinsic_void_sdnode;
@@ -1134,7 +1134,7 @@ public:
Record *getSDNodeNamed(StringRef Name) const;
- const SDNodeInfo &getSDNodeInfo(Record *R) const {
+ const SDNodeInfo &getSDNodeInfo(const Record *R) const {
auto F = SDNodes.find(R);
assert(F != SDNodes.end() && "Unknown node!");
return F->second;
@@ -1142,19 +1142,19 @@ public:
// Node transformation lookups.
typedef std::pair<Record *, std::string> NodeXForm;
- const NodeXForm &getSDNodeTransform(Record *R) const {
+ const NodeXForm &getSDNodeTransform(const Record *R) const {
auto F = SDNodeXForms.find(R);
assert(F != SDNodeXForms.end() && "Invalid transform!");
return F->second;
}
- const ComplexPattern &getComplexPattern(Record *R) const {
+ const ComplexPattern &getComplexPattern(const Record *R) const {
auto F = ComplexPatterns.find(R);
assert(F != ComplexPatterns.end() && "Unknown addressing mode!");
return F->second;
}
- const CodeGenIntrinsic &getIntrinsic(Record *R) const {
+ const CodeGenIntrinsic &getIntrinsic(const Record *R) const {
for (unsigned i = 0, e = Intrinsics.size(); i != e; ++i)
if (Intrinsics[i].TheDef == R)
return Intrinsics[i];
@@ -1181,20 +1181,19 @@ public:
}
// Pattern Fragment information.
- TreePattern *getPatternFragment(Record *R) const {
+ TreePattern *getPatternFragment(const Record *R) const {
auto F = PatternFragments.find(R);
assert(F != PatternFragments.end() && "Invalid pattern fragment request!");
return F->second.get();
}
- TreePattern *getPatternFragmentIfRead(Record *R) const {
+ TreePattern *getPatternFragmentIfRead(const Record *R) const {
auto F = PatternFragments.find(R);
if (F == PatternFragments.end())
return nullptr;
return F->second.get();
}
- typedef std::map<Record *, std::unique_ptr<TreePattern>,
- LessRecordByID>::const_iterator pf_iterator;
+ using pf_iterator = decltype(PatternFragments)::const_iterator;
pf_iterator pf_begin() const { return PatternFragments.begin(); }
pf_iterator pf_end() const { return PatternFragments.end(); }
iterator_range<pf_iterator> ptfs() const { return PatternFragments; }
@@ -1206,11 +1205,11 @@ public:
iterator_range<ptm_iterator> ptms() const { return PatternsToMatch; }
/// Parse the Pattern for an instruction, and insert the result in DAGInsts.
- typedef std::map<Record *, DAGInstruction, LessRecordByID> DAGInstMap;
+ typedef std::map<const Record *, DAGInstruction, LessRecordByID> DAGInstMap;
void parseInstructionPattern(CodeGenInstruction &CGI, ListInit *Pattern,
DAGInstMap &DAGInsts);
- const DAGInstruction &getInstruction(Record *R) const {
+ const DAGInstruction &getInstruction(const Record *R) const {
auto F = Instructions.find(R);
assert(F != Instructions.end() && "Unknown instruction!");
return F->second;
@@ -1244,7 +1243,7 @@ private:
void GenerateVariants();
void VerifyInstructionFlags();
- void ParseOnePattern(Record *TheDef, TreePattern &Pattern,
+ void ParseOnePattern(const Record *TheDef, TreePattern &Pattern,
TreePattern &Result,
const std::vector<Record *> &InstImpResults,
bool ShouldIgnore = false);
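
The CodeGenDAGPatterns hunks above lean on two idioms: maps keyed by `const Record *` keep working unchanged because LessRecordByID orders by record identity and only ever reads through the pointer, and the new `decltype(...)::const_iterator` alias tracks the map type automatically. A standalone sketch with assumed stand-in types, not LLVM code:

#include <cassert>
#include <map>
#include <string>

struct Record {
  unsigned ID;
  std::string Name;
};

// Hypothetical stand-in for LLVM's LessRecordByID: orders by unique ID, so
// const-qualifying the key pointer costs nothing.
struct LessRecordByID {
  bool operator()(const Record *L, const Record *R) const {
    return L->ID < R->ID;
  }
};

std::map<const Record *, std::string, LessRecordByID> PatternFragments;

// Mirrors `using pf_iterator = decltype(PatternFragments)::const_iterator;`:
// if the key or comparator changes again, the alias follows automatically.
using pf_iterator = decltype(PatternFragments)::const_iterator;

int main() {
  Record A{1, "add"}, B{2, "sub"};
  PatternFragments[&A] = "frag-add";
  PatternFragments[&B] = "frag-sub";
  for (pf_iterator I = PatternFragments.begin(), E = PatternFragments.end();
       I != E; ++I)
    assert(!I->second.empty());
  return 0;
}
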
diff --git a/llvm/utils/TableGen/Common/CodeGenInstruction.cpp b/llvm/utils/TableGen/Common/CodeGenInstruction.cpp
index 1cc217b..8d698fa 100644
--- a/llvm/utils/TableGen/Common/CodeGenInstruction.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenInstruction.cpp
@@ -298,7 +298,8 @@ CGIOperandList::ParseOperandName(StringRef Op, bool AllowWholeOp) {
return std::pair(0U, 0U);
}
-static void ParseConstraint(StringRef CStr, CGIOperandList &Ops, Record *Rec) {
+static void ParseConstraint(StringRef CStr, CGIOperandList &Ops,
+ const Record *Rec) {
// EARLY_CLOBBER: @early $reg
StringRef::size_type wpos = CStr.find_first_of(" \t");
StringRef::size_type start = CStr.find_first_not_of(" \t");
@@ -391,7 +392,8 @@ static void ParseConstraint(StringRef CStr, CGIOperandList &Ops, Record *Rec) {
Ops[SrcOp.first].Constraints[SrcOp.second] = NewConstraint;
}
-static void ParseConstraints(StringRef CStr, CGIOperandList &Ops, Record *Rec) {
+static void ParseConstraints(StringRef CStr, CGIOperandList &Ops,
+ const Record *Rec) {
if (CStr.empty())
return;
@@ -428,7 +430,7 @@ void CGIOperandList::ProcessDisableEncoding(StringRef DisableEncoding) {
// CodeGenInstruction Implementation
//===----------------------------------------------------------------------===//
-CodeGenInstruction::CodeGenInstruction(Record *R)
+CodeGenInstruction::CodeGenInstruction(const Record *R)
: TheDef(R), Operands(R), InferredFrom(nullptr) {
Namespace = R->getValueAsString("Namespace");
AsmString = std::string(R->getValueAsString("AsmString"));
@@ -501,7 +503,7 @@ CodeGenInstruction::CodeGenInstruction(Record *R)
HasComplexDeprecationPredicate = true;
DeprecatedReason =
std::string(R->getValueAsString("ComplexDeprecationPredicate"));
- } else if (RecordVal *Dep = R->getValue("DeprecatedFeatureMask")) {
+ } else if (const RecordVal *Dep = R->getValue("DeprecatedFeatureMask")) {
// Check if we have a Subtarget feature mask.
HasComplexDeprecationPredicate = false;
DeprecatedReason = Dep->getValue()->getAsString();
diff --git a/llvm/utils/TableGen/Common/CodeGenInstruction.h b/llvm/utils/TableGen/Common/CodeGenInstruction.h
index f4af0e8..3d4360f 100644
--- a/llvm/utils/TableGen/Common/CodeGenInstruction.h
+++ b/llvm/utils/TableGen/Common/CodeGenInstruction.h
@@ -222,8 +222,8 @@ public:
class CodeGenInstruction {
public:
- Record *TheDef; // The actual record defining this instruction.
- StringRef Namespace; // The namespace the instruction is in.
+ const Record *TheDef; // The actual record defining this instruction.
+ StringRef Namespace; // The namespace the instruction is in.
/// AsmString - The format string used to emit a .s file for the
/// instruction.
@@ -297,12 +297,12 @@ public:
// The record used to infer instruction flags, or NULL if no flag values
// have been inferred.
- Record *InferredFrom;
+ const Record *InferredFrom;
// The enum value assigned by CodeGenTarget::computeInstrsByEnum.
mutable unsigned EnumVal = 0;
- CodeGenInstruction(Record *R);
+ CodeGenInstruction(const Record *R);
/// HasOneImplicitDefWithKnownVT - If the instruction has at least one
/// implicit def and it has a known VT, return the VT, otherwise return
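
Storing `const Record *TheDef` is what makes the constructor change above safe to propagate: every use of the record inside CodeGenInstruction must now go through const member functions, and the compiler flags any mutation. A minimal sketch with hypothetical stand-in types, not the real classes:

#include <cassert>
#include <string>
#include <utility>

// Hypothetical stand-in for llvm::Record with one const and one non-const
// member function.
struct Record {
  std::string Name;
  const std::string &getName() const { return Name; }
  void setName(std::string N) { Name = std::move(N); } // non-const
};

class InstructionSketch {
  const Record *TheDef; // mirrors CodeGenInstruction::TheDef after the patch
public:
  explicit InstructionSketch(const Record *R) : TheDef(R) {
    // TheDef->setName("X"); // would no longer compile: setName is non-const
  }
  const std::string &name() const { return TheDef->getName(); }
};

int main() {
  Record R{"ADD32rr"};
  InstructionSketch CGI(&R);
  assert(CGI.name() == "ADD32rr");
  return 0;
}
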
diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
index cadc242..5b43f7d 100644
--- a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp
@@ -34,7 +34,6 @@
#include <iterator>
#include <map>
#include <queue>
-#include <set>
#include <string>
#include <tuple>
#include <utility>
@@ -48,7 +47,7 @@ using namespace llvm;
// CodeGenSubRegIndex
//===----------------------------------------------------------------------===//
-CodeGenSubRegIndex::CodeGenSubRegIndex(Record *R, unsigned Enum,
+CodeGenSubRegIndex::CodeGenSubRegIndex(const Record *R, unsigned Enum,
const CodeGenHwModes &CGH)
: TheDef(R), EnumValue(Enum), AllSuperRegsCovered(true), Artificial(true) {
Name = std::string(R->getName());
@@ -99,7 +98,7 @@ void CodeGenSubRegIndex::updateComponents(CodeGenRegBank &RegBank) {
PrintFatalError(TheDef->getLoc(),
"CoveredBySubRegs must have two or more entries");
SmallVector<CodeGenSubRegIndex *, 8> IdxParts;
- for (Record *Part : Parts)
+ for (const Record *Part : Parts)
IdxParts.push_back(RegBank.getSubRegIdx(Part));
setConcatenationOf(IdxParts);
}
@@ -190,8 +189,7 @@ void CodeGenRegister::buildObjectGraph(CodeGenRegBank &RegBank) {
// Add ad hoc alias links. This is a symmetric relationship between two
// registers, so build a symmetric graph by adding links in both ends.
- std::vector<Record *> Aliases = TheDef->getValueAsListOfDefs("Aliases");
- for (Record *Alias : Aliases) {
+ for (const Record *Alias : TheDef->getValueAsListOfDefs("Aliases")) {
CodeGenRegister *Reg = RegBank.getReg(Alias);
ExplicitAliases.push_back(Reg);
Reg->ExplicitAliases.push_back(this);
@@ -757,7 +755,8 @@ static void sortAndUniqueRegisters(CodeGenRegister::Vec &M) {
M.erase(llvm::unique(M, deref<std::equal_to<>>()), M.end());
}
-CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R)
+CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank,
+ const Record *R)
: TheDef(R), Name(std::string(R->getName())),
TopoSigs(RegBank.getNumTopoSigs()), EnumValue(-1), TSFlags(0) {
GeneratePressureSet = R->getValueAsBit("GeneratePressureSet");
@@ -765,7 +764,7 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R)
if (TypeList.empty())
PrintFatalError(R->getLoc(), "RegTypes list must not be empty!");
for (unsigned i = 0, e = TypeList.size(); i != e; ++i) {
- Record *Type = TypeList[i];
+ const Record *Type = TypeList[i];
if (!Type->isSubClassOf("ValueType"))
PrintFatalError(R->getLoc(),
"RegTypes list member '" + Type->getName() +
@@ -1168,9 +1167,9 @@ void CodeGenRegisterClass::buildRegUnitSet(
//===----------------------------------------------------------------------===//
CodeGenRegisterCategory::CodeGenRegisterCategory(CodeGenRegBank &RegBank,
- Record *R)
+ const Record *R)
: TheDef(R), Name(std::string(R->getName())) {
- for (Record *RegClass : R->getValueAsListOfDefs("Classes"))
+ for (const Record *RegClass : R->getValueAsListOfDefs("Classes"))
Classes.push_back(RegBank.getRegClass(RegClass));
}
@@ -1178,7 +1177,7 @@ CodeGenRegisterCategory::CodeGenRegisterCategory(CodeGenRegBank &RegBank,
// CodeGenRegBank
//===----------------------------------------------------------------------===//
-CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records,
+CodeGenRegBank::CodeGenRegBank(const RecordKeeper &Records,
const CodeGenHwModes &Modes)
: CGH(Modes) {
// Configure register Sets to understand register classes and tuples.
@@ -1189,10 +1188,8 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records,
// Read in the user-defined (named) sub-register indices.
// More indices will be synthesized later.
- std::vector<Record *> SRIs = Records.getAllDerivedDefinitions("SubRegIndex");
- llvm::sort(SRIs, LessRecord());
- for (unsigned i = 0, e = SRIs.size(); i != e; ++i)
- getSubRegIdx(SRIs[i]);
+ for (const Record *SRI : Records.getAllDerivedDefinitions("SubRegIndex"))
+ getSubRegIdx(SRI);
// Build composite maps from ComposedOf fields.
for (auto &Idx : SubRegIndices)
Idx.updateComponents(*this);
@@ -1223,7 +1220,7 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records,
getReg(Regs[i]);
// Expand tuples and number the new registers.
- for (Record *R : Records.getAllDerivedDefinitions("RegisterTuples")) {
+ for (const Record *R : Records.getAllDerivedDefinitions("RegisterTuples")) {
std::vector<const Record *> TupRegs = *Sets.expand(R);
llvm::sort(TupRegs, LessRecordRegister());
for (const Record *RC : TupRegs)
@@ -1288,7 +1285,8 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records,
NumNativeRegUnits = RegUnits.size();
// Read in register class definitions.
- std::vector<Record *> RCs = Records.getAllDerivedDefinitions("RegisterClass");
+ ArrayRef<const Record *> RCs =
+ Records.getAllDerivedDefinitions("RegisterClass");
if (RCs.empty())
PrintFatalError("No 'RegisterClass' subclasses defined!");
@@ -1311,9 +1309,7 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records,
CodeGenRegisterClass::computeSubClasses(*this);
// Read in the register category definitions.
- std::vector<Record *> RCats =
- Records.getAllDerivedDefinitions("RegisterCategory");
- for (auto *R : RCats)
+ for (const Record *R : Records.getAllDerivedDefinitions("RegisterCategory"))
RegCategories.emplace_back(*this, R);
}
@@ -1324,7 +1320,7 @@ CodeGenSubRegIndex *CodeGenRegBank::createSubRegIndex(StringRef Name,
return &SubRegIndices.back();
}
-CodeGenSubRegIndex *CodeGenRegBank::getSubRegIdx(Record *Def) {
+CodeGenSubRegIndex *CodeGenRegBank::getSubRegIdx(const Record *Def) {
CodeGenSubRegIndex *&Idx = Def2SubRegIdx[Def];
if (Idx)
return Idx;
@@ -2450,7 +2446,8 @@ void CodeGenRegBank::computeInferredRegisterClasses() {
/// return null. If the register is in multiple classes, and the classes have a
/// superset-subset relationship and the same set of types, return the
/// superclass. Otherwise return null.
-const CodeGenRegisterClass *CodeGenRegBank::getRegClassForRegister(Record *R) {
+const CodeGenRegisterClass *
+CodeGenRegBank::getRegClassForRegister(const Record *R) {
const CodeGenRegister *Reg = getReg(R);
const CodeGenRegisterClass *FoundRC = nullptr;
for (const auto &RC : getRegClasses()) {
@@ -2490,7 +2487,7 @@ const CodeGenRegisterClass *CodeGenRegBank::getRegClassForRegister(Record *R) {
}
const CodeGenRegisterClass *
-CodeGenRegBank::getMinimalPhysRegClass(Record *RegRecord,
+CodeGenRegBank::getMinimalPhysRegClass(const Record *RegRecord,
ValueTypeByHwMode *VT) {
const CodeGenRegister *Reg = getReg(RegRecord);
const CodeGenRegisterClass *BestRC = nullptr;
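
Several loops above collapse a copy-sort-index pattern into a single range-for over getAllDerivedDefinitions. A standalone sketch of that simplification, assuming (as the patch does) that the returned sequence is already deterministically ordered:

#include <cassert>
#include <vector>

struct Record {
  int ID;
};

// Stand-in for RecordKeeper::getAllDerivedDefinitions; assumed here to hand
// back records in a deterministic order already.
static std::vector<const Record *>
getAllDerivedDefinitions(const std::vector<Record> &Storage) {
  std::vector<const Record *> Out;
  for (const Record &R : Storage)
    Out.push_back(&R);
  return Out;
}

int main() {
  std::vector<Record> Storage{{1}, {2}, {3}};
  int LastID = 0;
  // New style: consume the sequence directly -- no temporary vector, no
  // explicit llvm::sort, no index loop.
  for (const Record *SRI : getAllDerivedDefinitions(Storage)) {
    assert(SRI->ID > LastID); // order is already deterministic
    LastID = SRI->ID;
  }
  return 0;
}
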
diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.h b/llvm/utils/TableGen/Common/CodeGenRegisters.h
index f0f53d3..2fa6cab 100644
--- a/llvm/utils/TableGen/Common/CodeGenRegisters.h
+++ b/llvm/utils/TableGen/Common/CodeGenRegisters.h
@@ -63,7 +63,7 @@ struct MaskRolPair {
/// CodeGenSubRegIndex - Represents a sub-register index.
class CodeGenSubRegIndex {
- Record *const TheDef;
+ const Record *const TheDef;
std::string Name;
std::string Namespace;
@@ -85,7 +85,7 @@ public:
// indexes are not used to create new register classes.
bool Artificial;
- CodeGenSubRegIndex(Record *R, unsigned Enum, const CodeGenHwModes &CGH);
+ CodeGenSubRegIndex(const Record *R, unsigned Enum, const CodeGenHwModes &CGH);
CodeGenSubRegIndex(StringRef N, StringRef Nspace, unsigned Enum);
CodeGenSubRegIndex(CodeGenSubRegIndex &) = delete;
@@ -320,7 +320,7 @@ class CodeGenRegisterClass {
// List of super-classes, topologically ordered to have the larger classes
// first. This is the same as sorting by EnumValue.
SmallVector<CodeGenRegisterClass *, 4> SuperClasses;
- Record *TheDef;
+ const Record *TheDef;
std::string Name;
// For a synthesized class, inherit missing properties from the nearest
@@ -368,7 +368,7 @@ public:
// Return the Record that defined this class, or NULL if the class was
// created by TableGen.
- Record *getDef() const { return TheDef; }
+ const Record *getDef() const { return TheDef; }
std::string getNamespaceQualification() const;
const std::string &getName() const { return Name; }
@@ -473,7 +473,7 @@ public:
void buildRegUnitSet(const CodeGenRegBank &RegBank,
std::vector<unsigned> &RegUnits) const;
- CodeGenRegisterClass(CodeGenRegBank &, Record *R);
+ CodeGenRegisterClass(CodeGenRegBank &, const Record *R);
CodeGenRegisterClass(CodeGenRegisterClass &) = delete;
// A key representing the parts of a register class used for forming
@@ -511,17 +511,17 @@ public:
// register falls into (GPR, vector, fixed, etc.) without having to know
// specific information about the target architecture.
class CodeGenRegisterCategory {
- Record *TheDef;
+ const Record *TheDef;
std::string Name;
std::list<CodeGenRegisterClass *> Classes;
public:
- CodeGenRegisterCategory(CodeGenRegBank &, Record *R);
+ CodeGenRegisterCategory(CodeGenRegBank &, const Record *R);
CodeGenRegisterCategory(CodeGenRegisterCategory &) = delete;
// Return the Record that defined this class, or NULL if the class was
// created by TableGen.
- Record *getDef() const { return TheDef; }
+ const Record *getDef() const { return TheDef; }
std::string getName() const { return Name; }
std::list<CodeGenRegisterClass *> getClasses() const { return Classes; }
@@ -585,7 +585,7 @@ class CodeGenRegBank {
const CodeGenHwModes &CGH;
std::deque<CodeGenSubRegIndex> SubRegIndices;
- DenseMap<Record *, CodeGenSubRegIndex *> Def2SubRegIdx;
+ DenseMap<const Record *, CodeGenSubRegIndex *> Def2SubRegIdx;
CodeGenSubRegIndex *createSubRegIndex(StringRef Name, StringRef NameSpace);
@@ -612,7 +612,6 @@ class CodeGenRegBank {
// Register categories.
std::list<CodeGenRegisterCategory> RegCategories;
- DenseMap<Record *, CodeGenRegisterCategory *> Def2RCat;
using RCatKeyMap =
std::map<CodeGenRegisterClass::Key, CodeGenRegisterCategory *>;
RCatKeyMap Key2RCat;
@@ -677,7 +676,7 @@ class CodeGenRegBank {
void computeRegUnitLaneMasks();
public:
- CodeGenRegBank(RecordKeeper &, const CodeGenHwModes &);
+ CodeGenRegBank(const RecordKeeper &, const CodeGenHwModes &);
CodeGenRegBank(CodeGenRegBank &) = delete;
SetTheory &getSets() { return Sets; }
@@ -693,7 +692,7 @@ public:
// Find a SubRegIndex from its Record def or add to the list if it does
// not exist there yet.
- CodeGenSubRegIndex *getSubRegIdx(Record *);
+ CodeGenSubRegIndex *getSubRegIdx(const Record *);
// Find a SubRegIndex from its Record def.
const CodeGenSubRegIndex *findSubRegIdx(const Record *Def) const;
@@ -785,14 +784,15 @@ public:
/// class, return null. If the register is in multiple classes, and the
/// classes have a superset-subset relationship and the same set of types,
/// return the superclass. Otherwise return null.
- const CodeGenRegisterClass *getRegClassForRegister(Record *R);
+ const CodeGenRegisterClass *getRegClassForRegister(const Record *R);
// Analog of TargetRegisterInfo::getMinimalPhysRegClass. Unlike
// getRegClassForRegister, this tries to find the smallest class containing
// the physical register. If \p VT is specified, it will only find classes
// with a matching type
const CodeGenRegisterClass *
- getMinimalPhysRegClass(Record *RegRecord, ValueTypeByHwMode *VT = nullptr);
+ getMinimalPhysRegClass(const Record *RegRecord,
+ ValueTypeByHwMode *VT = nullptr);
// Get the sum of unit weights.
unsigned getRegUnitSetWeight(const std::vector<unsigned> &Units) const {
diff --git a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp
index 0a48fea..3dcfdc9 100644
--- a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp
@@ -609,7 +609,7 @@ void CodeGenSchedModels::collectSchedRW() {
// Find all SchedReadWrites referenced by instruction defs.
RecVec SWDefs, SRDefs;
for (const CodeGenInstruction *Inst : Target.getInstructionsByEnumValue()) {
- Record *SchedDef = Inst->TheDef;
+ const Record *SchedDef = Inst->TheDef;
if (SchedDef->isValueUnset("SchedRW"))
continue;
RecVec RWs = SchedDef->getValueAsListOfDefs("SchedRW");
diff --git a/llvm/utils/TableGen/Common/DAGISelMatcher.h b/llvm/utils/TableGen/Common/DAGISelMatcher.h
index 81a5e3e..49dc472 100644
--- a/llvm/utils/TableGen/Common/DAGISelMatcher.h
+++ b/llvm/utils/TableGen/Common/DAGISelMatcher.h
@@ -983,14 +983,14 @@ private:
/// recorded node and records the result.
class EmitNodeXFormMatcher : public Matcher {
unsigned Slot;
- Record *NodeXForm;
+ const Record *NodeXForm;
public:
- EmitNodeXFormMatcher(unsigned slot, Record *nodeXForm)
+ EmitNodeXFormMatcher(unsigned slot, const Record *nodeXForm)
: Matcher(EmitNodeXForm), Slot(slot), NodeXForm(nodeXForm) {}
unsigned getSlot() const { return Slot; }
- Record *getNodeXForm() const { return NodeXForm; }
+ const Record *getNodeXForm() const { return NodeXForm; }
static bool classof(const Matcher *N) {
return N->getKind() == EmitNodeXForm;
diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
index 139bf2d..0779b1e 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
+++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp
@@ -822,7 +822,7 @@ SaveAndRestore<GISelFlags> RuleMatcher::setGISelFlags(const Record *R) {
}
Error RuleMatcher::defineComplexSubOperand(StringRef SymbolicName,
- Record *ComplexPattern,
+ const Record *ComplexPattern,
unsigned RendererID,
unsigned SubOperandID,
StringRef ParentSymbolicName) {
diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
index 5b517b1..94f26d8 100644
--- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
+++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.h
@@ -523,7 +523,7 @@ protected:
ArrayRef<SMLoc> SrcLoc;
- typedef std::tuple<Record *, unsigned, unsigned>
+ typedef std::tuple<const Record *, unsigned, unsigned>
DefinedComplexPatternSubOperand;
typedef StringMap<DefinedComplexPatternSubOperand>
DefinedComplexPatternSubOperandMap;
@@ -649,7 +649,8 @@ public:
void definePhysRegOperand(Record *Reg, OperandMatcher &OM);
- Error defineComplexSubOperand(StringRef SymbolicName, Record *ComplexPattern,
+ Error defineComplexSubOperand(StringRef SymbolicName,
+ const Record *ComplexPattern,
unsigned RendererID, unsigned SubOperandID,
StringRef ParentSymbolicName);
diff --git a/llvm/utils/TableGen/Common/InfoByHwMode.cpp b/llvm/utils/TableGen/Common/InfoByHwMode.cpp
index 7425e7f..e5e8225 100644
--- a/llvm/utils/TableGen/Common/InfoByHwMode.cpp
+++ b/llvm/utils/TableGen/Common/InfoByHwMode.cpp
@@ -28,7 +28,8 @@ std::string llvm::getModeName(unsigned Mode) {
return (Twine('m') + Twine(Mode)).str();
}
-ValueTypeByHwMode::ValueTypeByHwMode(Record *R, const CodeGenHwModes &CGH) {
+ValueTypeByHwMode::ValueTypeByHwMode(const Record *R,
+ const CodeGenHwModes &CGH) {
const HwModeSelect &MS = CGH.getHwModeSelect(R);
for (const HwModeSelect::PairType &P : MS.Items) {
auto I = Map.insert({P.first, MVT(llvm::getValueType(P.second))});
@@ -39,7 +40,8 @@ ValueTypeByHwMode::ValueTypeByHwMode(Record *R, const CodeGenHwModes &CGH) {
PtrAddrSpace = R->getValueAsInt("AddrSpace");
}
-ValueTypeByHwMode::ValueTypeByHwMode(Record *R, MVT T) : ValueTypeByHwMode(T) {
+ValueTypeByHwMode::ValueTypeByHwMode(const Record *R, MVT T)
+ : ValueTypeByHwMode(T) {
if (R->isSubClassOf("PtrValueType"))
PtrAddrSpace = R->getValueAsInt("AddrSpace");
}
@@ -102,7 +104,7 @@ void ValueTypeByHwMode::writeToStream(raw_ostream &OS) const {
LLVM_DUMP_METHOD
void ValueTypeByHwMode::dump() const { dbgs() << *this << '\n'; }
-ValueTypeByHwMode llvm::getValueTypeByHwMode(Record *Rec,
+ValueTypeByHwMode llvm::getValueTypeByHwMode(const Record *Rec,
const CodeGenHwModes &CGH) {
#ifndef NDEBUG
if (!Rec->isSubClassOf("ValueType"))
diff --git a/llvm/utils/TableGen/Common/InfoByHwMode.h b/llvm/utils/TableGen/Common/InfoByHwMode.h
index a6edf3c..4f11e8e 100644
--- a/llvm/utils/TableGen/Common/InfoByHwMode.h
+++ b/llvm/utils/TableGen/Common/InfoByHwMode.h
@@ -152,8 +152,8 @@ protected:
};
struct ValueTypeByHwMode : public InfoByHwMode<MVT> {
- ValueTypeByHwMode(Record *R, const CodeGenHwModes &CGH);
- ValueTypeByHwMode(Record *R, MVT T);
+ ValueTypeByHwMode(const Record *R, const CodeGenHwModes &CGH);
+ ValueTypeByHwMode(const Record *R, MVT T);
ValueTypeByHwMode(MVT T) { Map.insert({DefaultMode, T}); }
ValueTypeByHwMode() = default;
@@ -174,7 +174,8 @@ struct ValueTypeByHwMode : public InfoByHwMode<MVT> {
}
};
-ValueTypeByHwMode getValueTypeByHwMode(Record *Rec, const CodeGenHwModes &CGH);
+ValueTypeByHwMode getValueTypeByHwMode(const Record *Rec,
+ const CodeGenHwModes &CGH);
raw_ostream &operator<<(raw_ostream &OS, const ValueTypeByHwMode &T);
diff --git a/llvm/utils/TableGen/Common/SubtargetFeatureInfo.cpp b/llvm/utils/TableGen/Common/SubtargetFeatureInfo.cpp
index a4d6d8d..738ddf7 100644
--- a/llvm/utils/TableGen/Common/SubtargetFeatureInfo.cpp
+++ b/llvm/utils/TableGen/Common/SubtargetFeatureInfo.cpp
@@ -20,12 +20,10 @@ LLVM_DUMP_METHOD void SubtargetFeatureInfo::dump() const {
}
#endif
-std::vector<std::pair<Record *, SubtargetFeatureInfo>>
-SubtargetFeatureInfo::getAll(RecordKeeper &Records) {
- std::vector<std::pair<Record *, SubtargetFeatureInfo>> SubtargetFeatures;
- std::vector<Record *> AllPredicates =
- Records.getAllDerivedDefinitions("Predicate");
- for (Record *Pred : AllPredicates) {
+SubtargetFeaturesInfoVec
+SubtargetFeatureInfo::getAll(const RecordKeeper &Records) {
+ SubtargetFeaturesInfoVec SubtargetFeatures;
+ for (const Record *Pred : Records.getAllDerivedDefinitions("Predicate")) {
// Ignore predicates that are not intended for the assembler.
//
// The "AssemblerMatcherPredicate" string should be promoted to an argument
diff --git a/llvm/utils/TableGen/Common/SubtargetFeatureInfo.h b/llvm/utils/TableGen/Common/SubtargetFeatureInfo.h
index fee2c02..d75a9a4 100644
--- a/llvm/utils/TableGen/Common/SubtargetFeatureInfo.h
+++ b/llvm/utils/TableGen/Common/SubtargetFeatureInfo.h
@@ -19,18 +19,20 @@
namespace llvm {
struct SubtargetFeatureInfo;
using SubtargetFeatureInfoMap =
- std::map<Record *, SubtargetFeatureInfo, LessRecordByID>;
+ std::map<const Record *, SubtargetFeatureInfo, LessRecordByID>;
+using SubtargetFeaturesInfoVec =
+ std::vector<std::pair<const Record *, SubtargetFeatureInfo>>;
/// Helper class for storing information on a subtarget feature which
/// participates in instruction matching.
struct SubtargetFeatureInfo {
/// The predicate record for this feature.
- Record *TheDef;
+ const Record *TheDef;
/// A unique index assigned to represent this feature.
uint64_t Index;
- SubtargetFeatureInfo(Record *D, uint64_t Idx) : TheDef(D), Index(Idx) {}
+ SubtargetFeatureInfo(const Record *D, uint64_t Idx) : TheDef(D), Index(Idx) {}
/// The name of the enumerated constant identifying this feature.
std::string getEnumName() const {
@@ -48,8 +50,8 @@ struct SubtargetFeatureInfo {
}
void dump() const;
- static std::vector<std::pair<Record *, SubtargetFeatureInfo>>
- getAll(RecordKeeper &Records);
+
+ static SubtargetFeaturesInfoVec getAll(const RecordKeeper &Records);
/// Emit the subtarget feature flag definitions.
///
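
The SubtargetFeaturesInfoVec alias introduced above keeps the factory's return type and its callers from spelling the pair-vector type independently. A self-contained sketch with stand-in types:

#include <cstdint>
#include <utility>
#include <vector>

struct Record {
  unsigned ID;
};

struct SubtargetFeatureInfo {
  const Record *TheDef;
  uint64_t Index;
};

// One alias, used by both the declaration site and the factory below, so
// the two can never drift apart.
using SubtargetFeaturesInfoVec =
    std::vector<std::pair<const Record *, SubtargetFeatureInfo>>;

static SubtargetFeaturesInfoVec getAll(const std::vector<Record> &Preds) {
  SubtargetFeaturesInfoVec Out;
  uint64_t Idx = 0;
  for (const Record &Pred : Preds)
    Out.push_back({&Pred, SubtargetFeatureInfo{&Pred, Idx++}});
  return Out;
}

int main() {
  std::vector<Record> Preds{{1}, {2}};
  SubtargetFeaturesInfoVec All = getAll(Preds);
  return All.size() == 2 ? 0 : 1;
}
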
diff --git a/llvm/utils/TableGen/Common/VarLenCodeEmitterGen.cpp b/llvm/utils/TableGen/Common/VarLenCodeEmitterGen.cpp
index 049bd37..ce4cd35 100644
--- a/llvm/utils/TableGen/Common/VarLenCodeEmitterGen.cpp
+++ b/llvm/utils/TableGen/Common/VarLenCodeEmitterGen.cpp
@@ -77,7 +77,7 @@ class VarLenCodeEmitterGen {
// name suffix to improve readability of the generated code.
std::map<AltEncodingTy, std::string> Modes;
- DenseMap<Record *, DenseMap<AltEncodingTy, VarLenInst>> VarLenInsts;
+ DenseMap<const Record *, DenseMap<AltEncodingTy, VarLenInst>> VarLenInsts;
// Emit based values (i.e. fixed bits in the encoded instructions)
void emitInstructionBaseValues(
@@ -227,7 +227,7 @@ void VarLenCodeEmitterGen::run(raw_ostream &OS) {
auto NumberedInstructions = Target.getInstructionsByEnumValue();
for (const CodeGenInstruction *CGI : NumberedInstructions) {
- Record *R = CGI->TheDef;
+ const Record *R = CGI->TheDef;
// Create the corresponding VarLenInst instance.
if (R->getValueAsString("Namespace") == "TargetOpcode" ||
R->getValueAsBit("isPseudo"))
@@ -249,7 +249,7 @@ void VarLenCodeEmitterGen::run(raw_ostream &OS) {
continue;
}
}
- RecordVal *RV = R->getValue("Inst");
+ const RecordVal *RV = R->getValue("Inst");
DagInit *DI = cast<DagInit>(RV->getValue());
VarLenInsts[R].insert({Universal, VarLenInst(DI, RV)});
}
@@ -356,7 +356,7 @@ void VarLenCodeEmitterGen::emitInstructionBaseValues(
unsigned NumFixedValueWords = 0U;
for (const CodeGenInstruction *CGI : NumberedInstructions) {
- Record *R = CGI->TheDef;
+ const Record *R = CGI->TheDef;
if (R->getValueAsString("Namespace") == "TargetOpcode" ||
R->getValueAsBit("isPseudo")) {
diff --git a/llvm/utils/TableGen/DAGISelEmitter.cpp b/llvm/utils/TableGen/DAGISelEmitter.cpp
index b43a8e6..6c72103 100644
--- a/llvm/utils/TableGen/DAGISelEmitter.cpp
+++ b/llvm/utils/TableGen/DAGISelEmitter.cpp
@@ -47,7 +47,7 @@ static unsigned getResultPatternCost(TreePatternNode &P,
return 0;
unsigned Cost = 0;
- Record *Op = P.getOperator();
+ const Record *Op = P.getOperator();
if (Op->isSubClassOf("Instruction")) {
Cost++;
CodeGenInstruction &II = CGP.getTargetInfo().getInstruction(Op);
@@ -67,7 +67,7 @@ static unsigned getResultPatternSize(TreePatternNode &P,
return 0;
unsigned Cost = 0;
- Record *Op = P.getOperator();
+ const Record *Op = P.getOperator();
if (Op->isSubClassOf("Instruction")) {
Cost += Op->getValueAsInt("CodeSize");
}
diff --git a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
index a14cc3d..96a40f0 100644
--- a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
+++ b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp
@@ -64,8 +64,8 @@ class MatcherTableEmitter {
std::vector<const ComplexPattern *> ComplexPatterns;
- DenseMap<Record *, unsigned> NodeXFormMap;
- std::vector<Record *> NodeXForms;
+ DenseMap<const Record *, unsigned> NodeXFormMap;
+ std::vector<const Record *> NodeXForms;
std::vector<std::string> VecIncludeStrings;
MapVector<std::string, unsigned, StringMap<unsigned>> VecPatterns;
@@ -203,7 +203,7 @@ private:
return llvm::find(ComplexPatterns, &P) - ComplexPatterns.begin();
}
- unsigned getNodeXFormID(Record *Rec) {
+ unsigned getNodeXFormID(const Record *Rec) {
unsigned &Entry = NodeXFormMap[Rec];
if (Entry == 0) {
NodeXForms.push_back(Rec);
@@ -930,7 +930,7 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
GetPatFromTreePatternNode(SNT->getPattern().getSrcPattern());
std::string dst =
GetPatFromTreePatternNode(SNT->getPattern().getDstPattern());
- Record *PatRecord = SNT->getPattern().getSrcRecord();
+ const Record *PatRecord = SNT->getPattern().getSrcRecord();
std::string include_src = getIncludePath(PatRecord);
unsigned Offset =
getPatternIdxFromTable(src + " -> " + dst, std::move(include_src));
@@ -1043,7 +1043,7 @@ unsigned MatcherTableEmitter::EmitMatcher(const Matcher *N,
GetPatFromTreePatternNode(CM->getPattern().getSrcPattern());
std::string dst =
GetPatFromTreePatternNode(CM->getPattern().getDstPattern());
- Record *PatRecord = CM->getPattern().getSrcRecord();
+ const Record *PatRecord = CM->getPattern().getSrcRecord();
std::string include_src = getIncludePath(PatRecord);
unsigned Offset =
getPatternIdxFromTable(src + " -> " + dst, std::move(include_src));
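
getNodeXFormID, touched above, relies on a common interning idiom: a value-initialized map entry starts at 0, which doubles as the "unassigned" marker, so IDs are stored off by one. A standalone sketch with std::map standing in for llvm::DenseMap; the function body past the hunk shown is reconstructed here by assumption:

#include <cassert>
#include <map>
#include <vector>

struct Record {};

std::map<const Record *, unsigned> NodeXFormMap;
std::vector<const Record *> NodeXForms;

unsigned getNodeXFormID(const Record *Rec) {
  unsigned &Entry = NodeXFormMap[Rec]; // first access value-initializes to 0
  if (Entry == 0) {
    NodeXForms.push_back(Rec);
    Entry = NodeXForms.size(); // store index + 1 so 0 stays "unassigned"
  }
  return Entry - 1;
}

int main() {
  Record A, B;
  assert(getNodeXFormID(&A) == 0);
  assert(getNodeXFormID(&B) == 1);
  assert(getNodeXFormID(&A) == 0); // stable on repeated lookups
  return 0;
}
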
diff --git a/llvm/utils/TableGen/DAGISelMatcherGen.cpp b/llvm/utils/TableGen/DAGISelMatcherGen.cpp
index 4e65690..bb8f4dc 100644
--- a/llvm/utils/TableGen/DAGISelMatcherGen.cpp
+++ b/llvm/utils/TableGen/DAGISelMatcherGen.cpp
@@ -746,7 +746,7 @@ void MatcherGen::EmitResultLeafAsOperand(const TreePatternNode &N,
static bool mayInstNodeLoadOrStore(const TreePatternNode &N,
const CodeGenDAGPatterns &CGP) {
- Record *Op = N.getOperator();
+ const Record *Op = N.getOperator();
const CodeGenTarget &CGT = CGP.getTargetInfo();
CodeGenInstruction &II = CGT.getInstruction(Op);
return II.mayLoad || II.mayStore;
@@ -757,7 +757,7 @@ static unsigned numNodesThatMayLoadOrStore(const TreePatternNode &N,
if (N.isLeaf())
return 0;
- Record *OpRec = N.getOperator();
+ const Record *OpRec = N.getOperator();
if (!OpRec->isSubClassOf("Instruction"))
return 0;
@@ -773,7 +773,7 @@ static unsigned numNodesThatMayLoadOrStore(const TreePatternNode &N,
void MatcherGen::EmitResultInstructionAsOperand(
const TreePatternNode &N, SmallVectorImpl<unsigned> &OutputOps) {
- Record *Op = N.getOperator();
+ const Record *Op = N.getOperator();
const CodeGenTarget &CGT = CGP.getTargetInfo();
CodeGenInstruction &II = CGT.getInstruction(Op);
const DAGInstruction &Inst = CGP.getInstruction(Op);
@@ -1010,7 +1010,7 @@ void MatcherGen::EmitResultOperand(const TreePatternNode &N,
if (N.isLeaf())
return EmitResultLeafAsOperand(N, ResultOps);
- Record *OpRec = N.getOperator();
+ const Record *OpRec = N.getOperator();
if (OpRec->isSubClassOf("Instruction"))
return EmitResultInstructionAsOperand(N, ResultOps);
if (OpRec->isSubClassOf("SDNodeXForm"))
diff --git a/llvm/utils/TableGen/FastISelEmitter.cpp b/llvm/utils/TableGen/FastISelEmitter.cpp
index 2ef98b3..01df873 100644
--- a/llvm/utils/TableGen/FastISelEmitter.cpp
+++ b/llvm/utils/TableGen/FastISelEmitter.cpp
@@ -236,7 +236,7 @@ struct OperandsSignature {
// not needed and just bloat the fast instruction selector. For
// example, X86 doesn't need to generate code to match ADD16ri8 since
// ADD16ri will do just fine.
- Record *Rec = PredFn.getOrigPatFragRecord()->getRecord();
+ const Record *Rec = PredFn.getOrigPatFragRecord()->getRecord();
if (Rec->getValueAsBit("FastIselShouldIgnore"))
return false;
@@ -417,7 +417,7 @@ private:
};
} // End anonymous namespace
-static std::string getOpcodeName(Record *Op, CodeGenDAGPatterns &CGP) {
+static std::string getOpcodeName(const Record *Op, CodeGenDAGPatterns &CGP) {
return std::string(CGP.getSDNodeInfo(Op).getEnumName());
}
@@ -461,7 +461,7 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) {
TreePatternNode &Dst = Pattern.getDstPattern();
if (Dst.isLeaf())
continue;
- Record *Op = Dst.getOperator();
+ const Record *Op = Dst.getOperator();
if (!Op->isSubClassOf("Instruction"))
continue;
CodeGenInstruction &II = CGP.getTargetInfo().getInstruction(Op);
@@ -524,7 +524,7 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) {
if (InstPatNode.getNumTypes() > 1)
continue;
- Record *InstPatOp = InstPatNode.getOperator();
+ const Record *InstPatOp = InstPatNode.getOperator();
std::string OpcodeName = getOpcodeName(InstPatOp, CGP);
MVT::SimpleValueType RetVT = MVT::isVoid;
if (InstPatNode.getNumTypes())
diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp
index b2f4d32..d82f1c3 100644
--- a/llvm/utils/TableGen/GlobalISelEmitter.cpp
+++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp
@@ -120,7 +120,7 @@ static std::string explainPredicates(const TreePatternNode &N) {
if (P.isTruncStore())
Explanation += " truncstore";
- if (Record *VT = P.getMemoryVT())
+ if (const Record *VT = P.getMemoryVT())
Explanation += (" MemVT=" + VT->getName()).str();
if (Record *VT = P.getScalarMemoryVT())
Explanation += (" ScalarVT(MemVT)=" + VT->getName()).str();
@@ -168,7 +168,7 @@ static std::string explainPredicates(const TreePatternNode &N) {
return Explanation;
}
-std::string explainOperator(Record *Operator) {
+std::string explainOperator(const Record *Operator) {
if (Operator->isSubClassOf("SDNode"))
return (" (" + Operator->getValueAsString("Opcode") + ")").str();
@@ -346,7 +346,7 @@ private:
/// SDNodes to the GINodeEquiv mapping. We need to map to the GINodeEquiv to
/// check for attributes on the relation such as CheckMMOIsNonAtomic.
/// This is defined using 'GINodeEquiv' in the target description.
- DenseMap<Record *, Record *> NodeEquivs;
+ DenseMap<const Record *, Record *> NodeEquivs;
/// Keep track of the equivalence between ComplexPattern's and
/// GIComplexOperandMatcher. Map entries are specified by subclassing
@@ -379,7 +379,7 @@ private:
void gatherTypeIDValues();
void gatherNodeEquivs();
- Record *findNodeEquiv(Record *N) const;
+ Record *findNodeEquiv(const Record *N) const;
const CodeGenInstruction *getEquivNode(Record &Equiv,
const TreePatternNode &N) const;
@@ -388,7 +388,7 @@ private:
createAndImportSelDAGMatcher(RuleMatcher &Rule,
InstructionMatcher &InsnMatcher,
const TreePatternNode &Src, unsigned &TempOpIdx);
- Error importComplexPatternOperandMatcher(OperandMatcher &OM, Record *R,
+ Error importComplexPatternOperandMatcher(OperandMatcher &OM, const Record *R,
unsigned &TempOpIdx) const;
Error importChildMatcher(RuleMatcher &Rule, InstructionMatcher &InsnMatcher,
const TreePatternNode &SrcChild,
@@ -504,7 +504,7 @@ void GlobalISelEmitter::gatherNodeEquivs() {
}
}
-Record *GlobalISelEmitter::findNodeEquiv(Record *N) const {
+Record *GlobalISelEmitter::findNodeEquiv(const Record *N) const {
return NodeEquivs.lookup(N);
}
@@ -928,7 +928,7 @@ Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
}
Error GlobalISelEmitter::importComplexPatternOperandMatcher(
- OperandMatcher &OM, Record *R, unsigned &TempOpIdx) const {
+ OperandMatcher &OM, const Record *R, unsigned &TempOpIdx) const {
const auto &ComplexPattern = ComplexPatternEquivs.find(R);
if (ComplexPattern == ComplexPatternEquivs.end())
return failedImport("SelectionDAG ComplexPattern (" + R->getName() +
@@ -1508,7 +1508,7 @@ GlobalISelEmitter::createAndImportSubInstructionRenderer(
Expected<action_iterator> GlobalISelEmitter::createInstructionRenderer(
action_iterator InsertPt, RuleMatcher &M, const TreePatternNode &Dst) {
- Record *DstOp = Dst.getOperator();
+ const Record *DstOp = Dst.getOperator();
if (!DstOp->isSubClassOf("Instruction")) {
if (DstOp->isSubClassOf("ValueType"))
return failedImport(
@@ -1813,7 +1813,7 @@ GlobalISelEmitter::inferRegClassFromPattern(const TreePatternNode &N) {
// just take the first one).
if (N.getNumTypes() < 1)
return std::nullopt;
- Record *OpRec = N.getOperator();
+ const Record *OpRec = N.getOperator();
// We only want instructions.
if (!OpRec->isSubClassOf("Instruction"))
@@ -2011,7 +2011,7 @@ Expected<RuleMatcher> GlobalISelEmitter::runOnPattern(const PatternToMatch &P) {
}
// Start with the defined operands (i.e., the results of the root operator).
- Record *DstOp = Dst.getOperator();
+ const Record *DstOp = Dst.getOperator();
if (!DstOp->isSubClassOf("Instruction"))
return failedImport("Pattern operator isn't an instruction");
diff --git a/llvm/utils/TableGen/InstrDocsEmitter.cpp b/llvm/utils/TableGen/InstrDocsEmitter.cpp
index f948540..f53428e 100644
--- a/llvm/utils/TableGen/InstrDocsEmitter.cpp
+++ b/llvm/utils/TableGen/InstrDocsEmitter.cpp
@@ -73,7 +73,7 @@ static void EmitInstrDocs(RecordKeeper &RK, raw_ostream &OS) {
OS << "\n";
for (const CodeGenInstruction *II : Target.getInstructionsByEnumValue()) {
- Record *Inst = II->TheDef;
+ const Record *Inst = II->TheDef;
// Don't print the target-independent instructions.
if (II->Namespace == "TargetOpcode")
diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp
index 5fd5914..4e2138d 100644
--- a/llvm/utils/TableGen/InstrInfoEmitter.cpp
+++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp
@@ -721,9 +721,9 @@ void InstrInfoEmitter::emitMCIIHelperMethods(raw_ostream &OS,
}
static std::string
-getNameForFeatureBitset(const std::vector<Record *> &FeatureBitset) {
+getNameForFeatureBitset(ArrayRef<const Record *> FeatureBitset) {
std::string Name = "CEFBS";
- for (const auto &Feature : FeatureBitset)
+ for (const Record *Feature : FeatureBitset)
Name += ("_" + Feature->getName()).str();
return Name;
}
@@ -731,7 +731,7 @@ getNameForFeatureBitset(const std::vector<Record *> &FeatureBitset) {
void InstrInfoEmitter::emitFeatureVerifier(raw_ostream &OS,
const CodeGenTarget &Target) {
const auto &All = SubtargetFeatureInfo::getAll(Records);
- std::map<Record *, SubtargetFeatureInfo, LessRecordByID> SubtargetFeatures;
+ SubtargetFeatureInfoMap SubtargetFeatures;
SubtargetFeatures.insert(All.begin(), All.end());
OS << "#if (defined(ENABLE_INSTR_PREDICATE_VERIFIER) && !defined(NDEBUG)) "
@@ -752,18 +752,19 @@ void InstrInfoEmitter::emitFeatureVerifier(raw_ostream &OS,
SubtargetFeatureInfo::emitComputeAssemblerAvailableFeatures(
Target.getName(), "", "computeAvailableFeatures", SubtargetFeatures, OS);
- std::vector<std::vector<Record *>> FeatureBitsets;
+ std::vector<std::vector<const Record *>> FeatureBitsets;
for (const CodeGenInstruction *Inst : Target.getInstructionsByEnumValue()) {
FeatureBitsets.emplace_back();
- for (Record *Predicate : Inst->TheDef->getValueAsListOfDefs("Predicates")) {
+ for (const Record *Predicate :
+ Inst->TheDef->getValueAsListOfDefs("Predicates")) {
const auto &I = SubtargetFeatures.find(Predicate);
if (I != SubtargetFeatures.end())
FeatureBitsets.back().push_back(I->second.TheDef);
}
}
- llvm::sort(FeatureBitsets, [&](const std::vector<Record *> &A,
- const std::vector<Record *> &B) {
+ llvm::sort(FeatureBitsets, [&](const std::vector<const Record *> &A,
+ const std::vector<const Record *> &B) {
if (A.size() < B.size())
return true;
if (A.size() > B.size())
@@ -806,7 +807,8 @@ void InstrInfoEmitter::emitFeatureVerifier(raw_ostream &OS,
for (const CodeGenInstruction *Inst : Target.getInstructionsByEnumValue()) {
OS << " CEFBS";
unsigned NumPredicates = 0;
- for (Record *Predicate : Inst->TheDef->getValueAsListOfDefs("Predicates")) {
+ for (const Record *Predicate :
+ Inst->TheDef->getValueAsListOfDefs("Predicates")) {
const auto &I = SubtargetFeatures.find(Predicate);
if (I != SubtargetFeatures.end()) {
OS << '_' << I->second.TheDef->getName();
@@ -890,7 +892,8 @@ void InstrInfoEmitter::emitFeatureVerifier(raw_ostream &OS,
void InstrInfoEmitter::emitTIIHelperMethods(raw_ostream &OS,
StringRef TargetName,
bool ExpandDefinition) {
- RecVec TIIPredicates = Records.getAllDerivedDefinitions("TIIPredicate");
+ ArrayRef<const Record *> TIIPredicates =
+ Records.getAllDerivedDefinitions("TIIPredicate");
if (TIIPredicates.empty())
return;
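
getNameForFeatureBitset above moves from `const std::vector<Record *> &` to ArrayRef, so callers can pass any contiguous sequence without materializing a particular container. A sketch of the same idea using C++20 std::span as a stand-in for llvm::ArrayRef:

#include <cassert>
#include <span>
#include <string>
#include <vector>

struct Record {
  std::string Name;
  const std::string &getName() const { return Name; }
};

// A view parameter accepts vectors, arrays, and SmallVector-like storage
// alike without dictating the caller's container type.
static std::string
getNameForFeatureBitset(std::span<const Record *const> FeatureBitset) {
  std::string Name = "CEFBS";
  for (const Record *Feature : FeatureBitset)
    Name += "_" + Feature->getName();
  return Name;
}

int main() {
  Record A{"HasSSE2"}, B{"Is64Bit"};
  std::vector<const Record *> Bits{&A, &B};
  assert(getNameForFeatureBitset(Bits) == "CEFBS_HasSSE2_Is64Bit");
  return 0;
}
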
diff --git a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp
index 0fc930b..8952c8e 100644
--- a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp
+++ b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp
@@ -446,7 +446,7 @@ void X86FoldTablesEmitter::addEntryWithFlags(FoldTable &Table,
assert((IsManual || Table.find(RegInst) == Table.end()) &&
"Override entry unexpectedly");
X86FoldTableEntry Result = X86FoldTableEntry(RegInst, MemInst);
- Record *RegRec = RegInst->TheDef;
+ const Record *RegRec = RegInst->TheDef;
Result.NoReverse = S & TB_NO_REVERSE;
Result.NoForward = S & TB_NO_FORWARD;
Result.FoldLoad = S & TB_FOLDED_LOAD;
@@ -537,8 +537,8 @@ void X86FoldTablesEmitter::updateTables(const CodeGenInstruction *RegInst,
uint16_t S, bool IsManual,
bool IsBroadcast) {
- Record *RegRec = RegInst->TheDef;
- Record *MemRec = MemInst->TheDef;
+ const Record *RegRec = RegInst->TheDef;
+ const Record *MemRec = MemInst->TheDef;
unsigned MemOutSize = MemRec->getValueAsDag("OutOperandList")->getNumArgs();
unsigned RegOutSize = RegRec->getValueAsDag("OutOperandList")->getNumArgs();
unsigned MemInSize = MemRec->getValueAsDag("InOperandList")->getNumArgs();
diff --git a/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn
index fe8a3f5..acf9e7a 100644
--- a/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn
@@ -35,6 +35,7 @@ static_library("Target") {
sources = [
"ABI.cpp",
"AssertFrameRecognizer.cpp",
+ "CoreFileMemoryRanges.cpp",
"DynamicRegisterInfo.cpp",
"ExecutionContext.cpp",
"InstrumentationRuntime.cpp",
diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
index 2fda091..aceb9d0 100644
--- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
+++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
@@ -152,7 +152,7 @@ public:
/// This function adds a DENY entry.
void denyDialect(StringRef dialectNamespace) {
Entry::FilterFn filterFn = [=](Operation *op) {
- return op->getDialect()->getNamespace() == dialectNamespace;
+ return op->getName().getDialectNamespace() == dialectNamespace;
};
entries.push_back(Entry{filterFn, Entry::FilterType::DENY});
}
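
The denyDialect change above swaps `op->getDialect()->getNamespace()` for `op->getName().getDialectNamespace()`: the latter reads the namespace out of the operation name itself and so, by assumption here, avoids dereferencing a dialect pointer that can be null for unregistered dialects. A standalone sketch of that failure mode, with hypothetical structs rather than the MLIR classes:

#include <cassert>
#include <string>
#include <string_view>

struct OperationName {
  std::string Full; // e.g. "tosa.transpose"
  std::string_view getDialectNamespace() const {
    std::string_view V(Full);
    return V.substr(0, V.find('.'));
  }
};

struct Dialect {
  std::string Namespace;
};

struct Operation {
  OperationName Name;
  Dialect *Dia = nullptr; // may be null when the dialect is not registered
  const OperationName &getName() const { return Name; }
  Dialect *getDialect() const { return Dia; }
};

int main() {
  Operation Op{{"tosa.transpose"}, nullptr};
  // The old form would dereference null here: Op.getDialect()->Namespace.
  assert(Op.getName().getDialectNamespace() == "tosa");
  return 0;
}
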
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/Tosa/IR/CMakeLists.txt
index 12b4fc4..1ee105f 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/Tosa/IR/CMakeLists.txt
@@ -3,8 +3,8 @@ add_mlir_doc(TosaOps TosaOps Dialects/ -gen-op-doc)
add_mlir_interface(TosaInterfaces)
set(LLVM_TARGET_DEFINITIONS TosaOps.td)
-mlir_tablegen(TosaAttributes.h.inc -gen-attrdef-decls)
-mlir_tablegen(TosaAttributes.cpp.inc -gen-attrdef-defs)
+mlir_tablegen(TosaAttributes.h.inc -gen-attrdef-decls -attrdefs-dialect=tosa)
+mlir_tablegen(TosaAttributes.cpp.inc -gen-attrdef-defs -attrdefs-dialect=tosa)
add_public_tablegen_target(MLIRTosaAttributesIncGen)
set(LLVM_TARGET_DEFINITIONS TosaDialectBytecode.td)
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
index ab6daa3..63572f2 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
@@ -73,7 +73,6 @@ def Tosa_AvgPool2dOp : Tosa_InferShapedTypeOp<"avg_pool2d"> {
let arguments = (ins
Tosa_Tensor4D:$input,
-
Tosa_IntArrayAttr2:$kernel,
Tosa_IntArrayAttr2:$stride,
Tosa_IntArrayAttr4:$pad,
@@ -102,9 +101,8 @@ def Tosa_Conv2DOp : Tosa_InferShapedTypeOp<"conv2d"> {
let arguments = (ins
Tosa_Tensor4D:$input,
- 4DTensorOf<[Tosa_Weight]>:$weight,
+ TosaTensorRankOf<[Tosa_Weight], [4]>:$weight,
Tosa_Tensor1D:$bias,
-
Tosa_IntArrayAttr4:$pad,
Tosa_IntArrayAttr2:$stride,
Tosa_IntArrayAttr2:$dilation,
@@ -132,9 +130,8 @@ def Tosa_Conv3DOp : Tosa_InferShapedTypeOp<"conv3d"> {
let arguments = (ins
Tosa_Tensor5D:$input,
- TensorRankOf<[Tosa_Weight], [5]>:$weight,
+ TosaTensorRankOf<[Tosa_Weight], [5]>:$weight,
Tosa_Tensor1D:$bias,
-
Tosa_IntArrayAttr6:$pad,
Tosa_IntArrayAttr3:$stride,
Tosa_IntArrayAttr3:$dilation,
@@ -163,9 +160,8 @@ def Tosa_DepthwiseConv2DOp : Tosa_InferShapedTypeOp<"depthwise_conv2d"> {
let arguments = (ins
Tosa_Tensor4D:$input,
- 4DTensorOf<[Tosa_Weight]>:$weight,
+ TosaTensorRankOf<[Tosa_Weight], [4]>:$weight,
Tosa_Tensor1D:$bias,
-
Tosa_IntArrayAttr4:$pad,
Tosa_IntArrayAttr2:$stride,
Tosa_IntArrayAttr2:$dilation,
@@ -232,7 +228,7 @@ def Tosa_FullyConnectedOp : Tosa_InferShapedTypeOp<"fully_connected"> {
let arguments = (ins
Tosa_Tensor2D:$input,
- 2DTensorOf<[Tosa_Weight]>:$weight,
+ TosaTensorRankOf<[Tosa_Weight], [2]>:$weight,
Tosa_Tensor1D:$bias,
OptionalAttr<Tosa_ConvOpQuantizationAttr>:$quantization_info
);
@@ -347,9 +343,8 @@ def Tosa_TransposeConv2DOp : Tosa_InferShapedTypeOp<"transpose_conv2d"> {
let arguments = (ins
Tosa_Tensor4D:$input,
- 4DTensorOf<[Tosa_Weight]>:$filter,
+ TosaTensorRankOf<[Tosa_Weight], [4]>:$filter,
Tosa_Tensor1D:$bias,
-
Tosa_IntArrayAttr4:$out_pad,
Tosa_IntArrayAttr2:$stride,
Tosa_IntArrayAttrUpto4:$out_shape,
@@ -641,12 +636,12 @@ def Tosa_LogicalAndOp : Tosa_ElementwiseOp<"logical_and", [
}];
let arguments = (ins
- I1Tensor:$input1,
- I1Tensor:$input2
+ Tosa_I1Tensor:$input1,
+ Tosa_I1Tensor:$input2
);
let results = (outs
- I1Tensor:$z
+ Tosa_I1Tensor:$z
);
}
@@ -708,12 +703,12 @@ def Tosa_LogicalOrOp : Tosa_ElementwiseOp<"logical_or", [
}];
let arguments = (ins
- I1Tensor:$input1,
- I1Tensor:$input2
+ Tosa_I1Tensor:$input1,
+ Tosa_I1Tensor:$input2
);
let results = (outs
- I1Tensor:$z
+ Tosa_I1Tensor:$z
);
}
@@ -731,12 +726,12 @@ def Tosa_LogicalXorOp : Tosa_ElementwiseOp<"logical_xor", [
}];
let arguments = (ins
- I1Tensor:$input1,
- I1Tensor:$input2
+ Tosa_I1Tensor:$input1,
+ Tosa_I1Tensor:$input2
);
let results = (outs
- I1Tensor:$z
+ Tosa_I1Tensor:$z
);
}
@@ -1085,11 +1080,11 @@ def Tosa_LogicalNotOp : Tosa_ElementwiseOp<"logical_not",
}];
let arguments = (ins
- I1Tensor:$input1
+ Tosa_I1Tensor:$input1
);
let results = (outs
- I1Tensor:$output
+ Tosa_I1Tensor:$output
);
}
@@ -1208,7 +1203,7 @@ def Tosa_SelectOp : Tosa_ElementwiseOp<"select"> {
}];
let arguments = (ins
- I1Tensor:$pred,
+ Tosa_I1Tensor:$pred,
Tosa_Tensor:$on_true,
Tosa_Tensor:$on_false
);
@@ -1249,7 +1244,7 @@ def Tosa_EqualOp : Tosa_ElementwiseOp<"equal", [
);
let results = (outs
- I1Tensor:$output
+ Tosa_I1Tensor:$output
);
let extraClassDeclaration = [{
@@ -1277,7 +1272,7 @@ def Tosa_GreaterOp : Tosa_ElementwiseOp<"greater", [SameOperandsElementType]> {
);
let results = (outs
- I1Tensor:$output
+ Tosa_I1Tensor:$output
);
let hasFolder = 1;
@@ -1300,7 +1295,7 @@ def Tosa_GreaterEqualOp : Tosa_ElementwiseOp<"greater_equal",
);
let results = (outs
- I1Tensor:$output
+ Tosa_I1Tensor:$output
);
let hasFolder = 1;
@@ -1721,7 +1716,7 @@ def Tosa_TransposeOp : Tosa_InferShapedTypeOp<"transpose",
let arguments = (ins
Tosa_Tensor:$input1,
- Tosa_Int32Or64Tensor:$perms
+ Tosa_Int32Tensor:$perms
);
let results = (
@@ -1729,7 +1724,7 @@ def Tosa_TransposeOp : Tosa_InferShapedTypeOp<"transpose",
);
let extraClassDeclaration = [{
- LogicalResult getConstantPerms(llvm::SmallVector<int64_t> &perms);
+ LogicalResult getConstantPerms(llvm::SmallVector<int32_t> &perms);
}];
let hasCanonicalizer = 1;
@@ -1755,7 +1750,7 @@ def Tosa_GatherOp : Tosa_InferShapedTypeOp<"gather"> {
let arguments = (ins
Tosa_Tensor3D:$values,
- 2DTensorOf<[Tosa_Int32]>:$indices
+ TosaTensorRankOf<[Tosa_Int32], [2]>:$indices
);
let results = (outs
@@ -1776,7 +1771,7 @@ def Tosa_ScatterOp : Tosa_InferShapedTypeOp<"scatter"> {
let arguments = (ins
Tosa_Tensor3D:$values_in,
- 2DTensorOf<[Tosa_Int32]>:$indices,
+ TosaTensorRankOf<[Tosa_Int32], [2]>:$indices,
Tosa_Tensor3D:$input
);
@@ -1947,10 +1942,11 @@ def Tosa_ConstOp : Tosa_Op<"const", [ConstantLike, Pure,
);
let results = (outs
- TensorOf<[AnyTypeOf<[Tosa_AnyNumber]>]>:$output
+ TosaTensorOf<[AnyTypeOf<[Tosa_AnyNumber]>]>:$output
);
let hasFolder = 1;
+ let hasVerifier = 1;
}
//===----------------------------------------------------------------------===//
@@ -2054,7 +2050,7 @@ def Tosa_IfOp : Tosa_Op<"cond_if",
}];
let arguments = (ins
- I1Tensor:$cond,
+ Tosa_I1Tensor:$cond,
Variadic<Tosa_Tensor>:$inputs
);
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
index 14fc9c7..c3a0128 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
@@ -83,57 +83,82 @@ def Tosa_Weight : AnyTypeOf<[Tosa_Int4, Tosa_Int8,
Tosa_QuantizedInt, AnyFloat]>;
//===----------------------------------------------------------------------===//
+// TOSA Tensor Conformance
+//===----------------------------------------------------------------------===//
+
+def HasNo0Dimensions : And<[
+ IsRankedTensorTypePred,
+ CPred<"::llvm::all_of(::llvm::cast<::mlir::RankedTensorType>($_self).getShape(), [](auto v) { return v != 0; })">]>;
+
+class TosaTensorOf<
+ list<Type> allowedTypes, string summary = "tosa-conformant tensor">
+ : TensorOf<allowedTypes, [Or<[HasNo0Dimensions, IsUnrankedTensorTypePred]>], summary>;
+
+class TosaRankedTensorOf<
+ list<Type> allowedTypes, list<Pred> preds = [], string summary = "tosa-conformant ranked tensor">
+ : RankedTensorOf<allowedTypes, !listconcat([HasNo0Dimensions], preds), summary>;
+
+class TosaUnrankedTensorOf<list<Type> allowedTypes, list<Pred> preds = [], string summary = "tosa-conformant unranked tensor">
+ : UnrankedTensorOf<allowedTypes, preds, summary>;
+
+class TosaTensorRankOf<list<Type> allowedTypes, list<int> ranks>
+ : TosaRankedTensorOf<allowedTypes,
+ [HasAnyRankOfPred<ranks>],
+ !interleave(!foreach(rank, ranks, rank # "D"), "/") # " tensor">;
+
+//===----------------------------------------------------------------------===//
// Tensor types
//===----------------------------------------------------------------------===//
-def Tosa_Int32Tensor : TensorOf<[Tosa_Int32]>;
-def Tosa_Int32Or64Tensor : TensorOf<[Tosa_Int32Or64]>;
+def Tosa_I1Tensor : TosaTensorOf<[I1]>;
+def Tosa_Int32Tensor : TosaTensorOf<[Tosa_Int32]>;
+def Tosa_Int32Or64Tensor : TosaTensorOf<[Tosa_Int32Or64]>;
-def Tosa_FloatTensor : TensorOf<[AnyFloat]>;
+def Tosa_FloatTensor : TosaTensorOf<[AnyFloat]>;
// Either ranked or unranked tensor of TOSA supported element types.
-def Tosa_Tensor : TensorOf<[Tosa_AnyNumber]>;
+def Tosa_Tensor : TosaTensorOf<[Tosa_AnyNumber]>;
// Must be ranked but no further constraints
-def Tosa_RankedTensor : RankedTensorOf<[Tosa_AnyNumber]>;
+def Tosa_RankedTensor : TosaRankedTensorOf<[Tosa_AnyNumber]>;
// Any tensor element type allowed in Tosa ops.
def Tosa_ElementType : Type<Or<[Tosa_Int.predicate, Tosa_QuantizedInt.predicate,
AnyFloat.predicate]>, "tosa.dtype">;
class Tosa_TensorOfOrNone<list<Type> allowedTypes, string description = ""> :
- AnyTypeOf<[TensorOf<allowedTypes>, NoneType], description>;
+ AnyTypeOf<[TosaTensorOf<allowedTypes>, NoneType], description>;
//===----------------------------------------------------------------------===//
// Tensor types with constrained ranks.
//===----------------------------------------------------------------------===//
// Rank-0 (scalar) tensor
-def Tosa_ScalarTensor : TensorRankOf<[Tosa_AnyNumber], [0]>;
+def Tosa_ScalarTensor : TosaTensorRankOf<[Tosa_AnyNumber], [0]>;
// We include unranked tensors as a supported type for all possible TOSA
// tensors, since unranked does not necessarily mean invalid. Any unranked
// tensors should be shape-propagated with TOSA's shape inference pass and
// verified to leave no remaining unranked tensors.
-def Tosa_UnrankedTensor : UnrankedTensorOf<[Tosa_AnyNumber]>;
+def Tosa_UnrankedTensor : TosaUnrankedTensorOf<[Tosa_AnyNumber]>;
-def Tosa_Tensor1D : AnyTypeOf<[Tosa_UnrankedTensor, 1DTensorOf<[Tosa_AnyNumber]>], "1-d tensor", "::mlir::TensorType">;
-def Tosa_Tensor2D : AnyTypeOf<[Tosa_UnrankedTensor, 2DTensorOf<[Tosa_AnyNumber]>], "2-d tensor", "::mlir::TensorType">;
-def Tosa_Tensor3D : AnyTypeOf<[Tosa_UnrankedTensor, 3DTensorOf<[Tosa_AnyNumber]>], "3-d tensor", "::mlir::TensorType">;
-def Tosa_Tensor4D : AnyTypeOf<[Tosa_UnrankedTensor, 4DTensorOf<[Tosa_AnyNumber]>], "4-d tensor", "::mlir::TensorType">;
-def Tosa_Tensor5D : AnyTypeOf<[Tosa_UnrankedTensor, TensorRankOf<[Tosa_AnyNumber], [5]>], "5-d tensor", "::mlir::TensorType">;
+def Tosa_Tensor1D : AnyTypeOf<[Tosa_UnrankedTensor, TosaTensorRankOf<[Tosa_AnyNumber], [1]>], "1-d tosa-conformant tensor", "::mlir::TensorType">;
+def Tosa_Tensor2D : AnyTypeOf<[Tosa_UnrankedTensor, TosaTensorRankOf<[Tosa_AnyNumber], [2]>], "2-d tosa-conformant tensor", "::mlir::TensorType">;
+def Tosa_Tensor3D : AnyTypeOf<[Tosa_UnrankedTensor, TosaTensorRankOf<[Tosa_AnyNumber], [3]>], "3-d tosa-conformant tensor", "::mlir::TensorType">;
+def Tosa_Tensor4D : AnyTypeOf<[Tosa_UnrankedTensor, TosaTensorRankOf<[Tosa_AnyNumber], [4]>], "4-d tosa-conformant tensor", "::mlir::TensorType">;
+def Tosa_Tensor5D : AnyTypeOf<[Tosa_UnrankedTensor, TosaTensorRankOf<[Tosa_AnyNumber], [5]>], "5-d tosa-conformant tensor", "::mlir::TensorType">;
// Ranked tensors up to given rank.
def Tosa_Tensor1Dto4D : AnyTypeOf<[
- Tosa_UnrankedTensor, TensorRankOf<[Tosa_AnyNumber], [1,2,3,4]>]>;
+ Tosa_UnrankedTensor, TosaTensorRankOf<[Tosa_AnyNumber], [1,2,3,4]>]>;
def Tosa_Tensor1Dto6D : AnyTypeOf<[
- Tosa_UnrankedTensor, TensorRankOf<[Tosa_AnyNumber], [1,2,3,4,5,6]>]>;
+ Tosa_UnrankedTensor, TosaTensorRankOf<[Tosa_AnyNumber], [1,2,3,4,5,6]>]>;
def Tosa_TensorUpto4D : AnyTypeOf<[
- Tosa_UnrankedTensor, TensorRankOf<[Tosa_AnyNumber], [0,1,2,3,4]>]>;
+ Tosa_UnrankedTensor, TosaTensorRankOf<[Tosa_AnyNumber], [0,1,2,3,4]>]>;
def Tosa_Int32TensorUpto4D : AnyTypeOf<[
- Tosa_UnrankedTensor, TensorRankOf<[Tosa_Int32], [0,1,2,3,4]>]>;
+ Tosa_UnrankedTensor, TosaTensorRankOf<[Tosa_Int32], [0,1,2,3,4]>]>;
//===----------------------------------------------------------------------===//
// Generic scalar, vector, or tensor of a particular type.
@@ -142,7 +167,7 @@ def Tosa_Int32TensorUpto4D : AnyTypeOf<[
class Tosa_TypeLike<list<Type> types, string description = ""> : TypeConstraint<Or<[
AnyTypeOf<types>.predicate,
VectorOf<types>.predicate,
- TensorOf<types>.predicate]>,
+ TosaTensorOf<types>.predicate]>,
description>;
def Tosa_IntLike : Tosa_TypeLike<[Tosa_Int], "signless-integer-like">;
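
The HasNo0Dimensions predicate above is the core of the new tosa-conformance constraints: ranked tensors may not have a 0-extent dimension (MLIR's dynamic-dimension sentinel is nonzero, so dynamic dims pass). A plain-C++ sketch of the check, with a bare shape vector standing in for mlir::RankedTensorType:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Plain-C++ rendering of the CPred above: every static dimension must be
// non-zero for the tensor to be tosa-conformant.
static bool hasNo0Dimensions(const std::vector<int64_t> &shape) {
  return std::all_of(shape.begin(), shape.end(),
                     [](int64_t v) { return v != 0; });
}

int main() {
  assert(hasNo0Dimensions({1, 8, 8, 3}));
  assert(!hasNo0Dimensions({4, 0, 2})); // 0-extent dim: not tosa-conformant
  assert(hasNo0Dimensions({}));         // rank-0 (scalar) tensor is fine
  return 0;
}
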
diff --git a/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h b/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h
index ef40b34..90fea1f 100644
--- a/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h
+++ b/mlir/include/mlir/Dialect/Tosa/Utils/ConversionUtils.h
@@ -216,6 +216,19 @@ TosaOp CreateOpAndInferShape(PatternRewriter &rewriter, Location loc,
return CreateOpAndInferShape<TosaOp>(builder, resultTy, args...);
}
+// Apply an int32_t permutation to an input that has the same size as perms.
+// Perms should contain a permutation of the values 0 .. perms.size() - 1.
+template <typename T>
+SmallVector<T> applyTOSAPermutation(ArrayRef<T> input,
+ ArrayRef<int32_t> perms) {
+ SmallVector<T> permuted;
+ size_t N = input.size();
+ permuted.resize_for_overwrite(N);
+ for (size_t i = 0; i < N; i++)
+ permuted[i] = input[perms[i]];
+ return permuted;
+}
+
} // namespace tosa
} // namespace mlir
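
A usage sketch for the applyTOSAPermutation template added above, with std::vector standing in for SmallVector/ArrayRef; the perms here are a hypothetical NHWC-to-HWCN reshuffle of a shape vector:

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Same loop as the header above: permuted[i] takes input[perms[i]].
template <typename T>
std::vector<T> applyTOSAPermutation(const std::vector<T> &input,
                                    const std::vector<int32_t> &perms) {
  std::vector<T> permuted(input.size());
  for (std::size_t i = 0; i < input.size(); i++)
    permuted[i] = input[perms[i]];
  return permuted;
}

int main() {
  std::vector<int64_t> shape{1, 8, 8, 3};  // N, H, W, C
  std::vector<int32_t> perms{1, 2, 3, 0};  // NHWC -> HWCN
  std::vector<int64_t> out = applyTOSAPermutation(shape, perms);
  assert((out == std::vector<int64_t>{8, 8, 3, 1}));
  return 0;
}
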
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 93e8b08..2992671 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -329,10 +329,9 @@ void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
target.addLegalDialect<ROCDL::ROCDLDialect>();
target.addIllegalDialect<gpu::GPUDialect>();
- target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FAbsOp,
- LLVM::FCeilOp, LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp,
- LLVM::Log10Op, LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp,
- LLVM::SqrtOp>();
+ target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FCeilOp,
+ LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op,
+ LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>();
// TODO: Remove once we support replacing non-root ops.
target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp
index 77c3d2e..fe53b49 100644
--- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp
+++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp
@@ -313,7 +313,7 @@ public:
// convolution operation.
// TODO(suderman): See if this can be efficiently folded - check whether
// the input is used anywhere else, if not fold the constant.
- SmallVector<int64_t> weightPerm;
+ SmallVector<int32_t> weightPerm;
for (int i = 1; i < resultTy.getRank(); i++)
weightPerm.push_back(i);
weightPerm.push_back(0);
@@ -321,7 +321,7 @@ public:
SmallVector<int64_t> newWeightShape;
for (auto dim : weightPerm)
newWeightShape.push_back(weightShape[dim]);
- auto weightPermAttr = rewriter.getI64TensorAttr(weightPerm);
+ auto weightPermAttr = rewriter.getI32TensorAttr(weightPerm);
Value weightPermValue =
rewriter.create<arith::ConstantOp>(loc, weightPermAttr);
Type newWeightTy =
@@ -337,7 +337,7 @@ public:
if (5 == inputTy.getRank()) {
// TODO(suderman): See if this can be efficiently folded - check whether
// the input is used anywhere else, if not fold the constant.
- SmallVector<int64_t> weightPerm;
+ SmallVector<int32_t> weightPerm;
for (int i = 1; i < resultTy.getRank(); i++)
weightPerm.push_back(i);
weightPerm.push_back(0);
@@ -345,7 +345,7 @@ public:
SmallVector<int64_t> newWeightShape;
for (auto dim : weightPerm)
newWeightShape.push_back(weightShape[dim]);
- auto weightPermAttr = rewriter.getI64TensorAttr(weightPerm);
+ auto weightPermAttr = rewriter.getI32TensorAttr(weightPerm);
Value weightPermValue =
rewriter.create<arith::ConstantOp>(loc, weightPermAttr);
Type newWeightTy =
@@ -1040,22 +1040,25 @@ public:
LogicalResult matchAndRewrite(tosa::TransposeOp op,
PatternRewriter &rewriter) const final {
- SmallVector<int64_t> constantPerms;
+ SmallVector<int32_t> constantPerms;
if (failed(op.getConstantPerms(constantPerms)))
return failure();
Location loc = op.getLoc();
- // The verifier should have made sure we have a valid permutation tensor.
- assert(isPermutationVector(constantPerms) && "Expected valid permutation");
+    // The verifier should have already ensured we have a valid TOSA
+    // permutation tensor; isPermutationVector alone does not check the
+    // constraints TOSA perms must satisfy, so it is not re-asserted here.
SmallVector<OpFoldResult> inputSizes =
tensor::getMixedSizes(rewriter, loc, op.getInput1());
auto permutedSizes =
- applyPermutation<OpFoldResult>(inputSizes, constantPerms);
+ applyTOSAPermutation<OpFoldResult>(inputSizes, constantPerms);
auto permutedInit = rewriter.create<tensor::EmptyOp>(
loc, permutedSizes, op.getInput1().getType().getElementType());
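+    // linalg.transpose expects an int64_t permutation, so the verified i32
+    // TOSA perms are widened when building the replacement op.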
rewriter.replaceOpWithNewOp<linalg::TransposeOp>(
- op, op.getInput1(), permutedInit, constantPerms);
+ op, op.getInput1(), permutedInit,
+ llvm::to_vector(llvm::map_range(
+ constantPerms, [](int32_t v) -> int64_t { return v; })));
return success();
}
};
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
index da9a93f..03876a7 100644
--- a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp
@@ -88,7 +88,7 @@ struct ConsolidateTransposeOptimization
return rewriter.notifyMatchFailure(transposeOp,
"input must be transpose operation");
- SmallVector<int64_t> transposePerms, innerTransposePerms;
+ SmallVector<int32_t> transposePerms, innerTransposePerms;
if (transposeOp.getConstantPerms(transposePerms).failed())
return rewriter.notifyMatchFailure(transposeOp,
"transpose perms must be constant");
@@ -497,8 +497,10 @@ OpFoldResult AddOp::fold(FoldAdaptor adaptor) {
return {};
auto resultETy = resultTy.getElementType();
- auto lhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1());
- auto rhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2());
+ auto lhsAttr =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1());
+ auto rhsAttr =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2());
if (lhsTy == resultTy && isSplatZero(resultETy, rhsAttr))
return getInput1();
@@ -536,8 +538,10 @@ OpFoldResult IntDivOp::fold(FoldAdaptor adaptor) {
// IntDivOp inputs must be integer type, no need to check for quantized type
auto resultETy = resultTy.getElementType();
- auto lhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1());
- auto rhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2());
+ auto lhsAttr =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1());
+ auto rhsAttr =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2());
if (lhsAttr && lhsAttr.isSplat()) {
if (llvm::isa<IntegerType>(resultETy) &&
lhsAttr.getSplatValue<APInt>().isZero())
@@ -605,10 +609,13 @@ OpFoldResult MulOp::fold(FoldAdaptor adaptor) {
return {};
auto resultETy = resultTy.getElementType();
- auto lhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1());
- auto rhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2());
+ auto lhsAttr =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1());
+ auto rhsAttr =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2());
const int64_t shift = llvm::isa<IntegerType>(resultETy) ? getShift() : 0;
+
if (rhsTy == resultTy) {
if (isSplatZero(resultETy, lhsAttr))
return lhsAttr.resizeSplat(resultTy);
@@ -638,8 +645,10 @@ OpFoldResult SubOp::fold(FoldAdaptor adaptor) {
return {};
auto resultETy = resultTy.getElementType();
- auto lhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1());
- auto rhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2());
+ auto lhsAttr =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1());
+ auto rhsAttr =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2());
if (lhsTy == resultTy && isSplatZero(resultETy, rhsAttr))
return getInput1();
@@ -681,8 +690,10 @@ struct APIntFoldGreaterEqual {
OpFoldResult GreaterOp::fold(FoldAdaptor adaptor) {
auto resultTy = llvm::dyn_cast<RankedTensorType>(getType());
- auto lhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1());
- auto rhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2());
+ auto lhsAttr =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1());
+ auto rhsAttr =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2());
if (!lhsAttr || !rhsAttr)
return {};
@@ -693,8 +704,10 @@ OpFoldResult GreaterOp::fold(FoldAdaptor adaptor) {
OpFoldResult GreaterEqualOp::fold(FoldAdaptor adaptor) {
auto resultTy = llvm::dyn_cast<RankedTensorType>(getType());
- auto lhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1());
- auto rhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2());
+ auto lhsAttr =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1());
+ auto rhsAttr =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2());
if (!lhsAttr || !rhsAttr)
return {};
@@ -706,8 +719,10 @@ OpFoldResult GreaterEqualOp::fold(FoldAdaptor adaptor) {
OpFoldResult EqualOp::fold(FoldAdaptor adaptor) {
auto resultTy = llvm::dyn_cast<RankedTensorType>(getType());
- auto lhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1());
- auto rhsAttr = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2());
+ auto lhsAttr =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1());
+ auto rhsAttr =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput2());
Value lhs = getInput1();
Value rhs = getInput2();
auto lhsTy = llvm::cast<ShapedType>(lhs.getType());
@@ -838,14 +853,16 @@ OpFoldResult ReshapeOp::fold(FoldAdaptor adaptor) {
return {};
// reshape(const(x)) -> const(reshape-attr(x))
- if (auto operand = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1())) {
+ if (auto operand =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1())) {
// Constants must have static shape.
if (!outputTy.hasStaticShape())
return {};
// Okay to duplicate splat constants.
if (operand.isSplat())
- return SplatElementsAttr::get(outputTy, operand.getSplatValue<Attribute>());
+ return SplatElementsAttr::get(outputTy,
+ operand.getSplatValue<Attribute>());
// Don't duplicate other constants.
if (!getInput1().hasOneUse())
@@ -905,7 +922,8 @@ OpFoldResult ReverseOp::fold(FoldAdaptor adaptor) {
auto operand = getInput();
auto operandTy = llvm::cast<ShapedType>(operand.getType());
auto axis = getAxis();
- auto operandAttr = llvm::dyn_cast_if_present<SplatElementsAttr>(adaptor.getInput());
+ auto operandAttr =
+ llvm::dyn_cast_if_present<SplatElementsAttr>(adaptor.getInput());
if (operandAttr)
return operandAttr;
@@ -954,7 +972,8 @@ OpFoldResult tosa::SelectOp::fold(FoldAdaptor adaptor) {
if (getOnTrue() == getOnFalse())
return getOnTrue();
- auto predicate = llvm::dyn_cast_if_present<DenseIntElementsAttr>(adaptor.getPred());
+ auto predicate =
+ llvm::dyn_cast_if_present<DenseIntElementsAttr>(adaptor.getPred());
if (!predicate)
return {};
@@ -975,7 +994,8 @@ OpFoldResult TransposeOp::fold(FoldAdaptor adaptor) {
auto resultTy = llvm::cast<ShapedType>(getType());
// Transposing splat values just means reshaping.
- if (auto input = llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1())) {
+ if (auto input =
+ llvm::dyn_cast_if_present<DenseElementsAttr>(adaptor.getInput1())) {
if (input.isSplat() && resultTy.hasStaticShape() &&
input.getType().getElementType() == resultTy.getElementType())
return input.reshape(resultTy);
@@ -986,11 +1006,11 @@ OpFoldResult TransposeOp::fold(FoldAdaptor adaptor) {
return {};
// Transpose is not the identity transpose.
- SmallVector<int64_t> perms;
+ SmallVector<int32_t> perms;
if (getConstantPerms(perms).failed())
return {};
- if (!llvm::equal(llvm::seq<int64_t>(0, perms.size()), perms))
+ if (!llvm::equal(llvm::seq<int32_t>(0, perms.size()), perms))
return {};
return getInput1();
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
index d93db1b..0d0241f 100644
--- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
@@ -204,22 +204,6 @@ void mlir::tosa::printTypeOrAttr(OpAsmPrinter &p, Operation *op, TypeAttr type,
// TOSA Operator Verifiers.
//===----------------------------------------------------------------------===//
-static bool hasZeroDimension(ShapedType shapedType) {
- if (!shapedType.hasRank())
- return false;
-
- auto rank = shapedType.getRank();
-
- for (int i = 0; i < rank; i++) {
- if (shapedType.isDynamicDim(i))
- continue;
- if (shapedType.getDimSize(i) == 0)
- return true;
- }
-
- return false;
-}
-
template <typename T>
static LogicalResult verifyConvOp(T op) {
// All TOSA conv ops have an input() and weight().
@@ -236,10 +220,6 @@ static LogicalResult verifyConvOp(T op) {
return failure();
}
- if (hasZeroDimension(inputType))
- return op.emitOpError() << "tensor has a dimension with size zero. Each "
- "dimension of a tensor must have size >= 1";
-
auto inputEType = inputType.getElementType();
auto weightEType = weightType.getElementType();
@@ -262,6 +242,29 @@ static LogicalResult verifyConvOp(T op) {
"allowed for float type");
return failure();
}
+ return success();
+}
+
+LogicalResult tosa::ConstOp::verify() {
+ auto attrType = llvm::dyn_cast<TensorType>(getValueAttr().getType());
+ auto outputType = llvm::dyn_cast<TensorType>(getOutput().getType());
+
+ if (!attrType || !outputType) {
+ emitOpError("expected tensors for attr/result type");
+ return failure();
+ }
+
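+  // A quantized output element type is accepted as long as the attribute
+  // holds the quantized type's storage type (e.g. i8 values backing a
+  // quantized result element type).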
+ if (auto result = llvm::dyn_cast<mlir::quant::QuantizedType>(
+ outputType.getElementType())) {
+ if (result.getStorageType() == attrType.getElementType())
+ return success();
+ }
+
+ if (attrType.getElementType() != outputType.getElementType()) {
+ emitOpError("expected same attr/result element types");
+ return failure();
+ }
return success();
}
@@ -283,9 +286,6 @@ LogicalResult tosa::ArgMaxOp::verify() {
LogicalResult tosa::AvgPool2dOp::verify() {
auto inputType = llvm::cast<ShapedType>(getInput().getType());
- if (hasZeroDimension(inputType))
- return emitOpError() << "tensor has a dimension with size zero. Each "
- "dimension of a tensor must have size >= 1";
auto inputETy = inputType.getElementType();
auto resultETy = llvm::cast<ShapedType>(getType()).getElementType();
@@ -341,9 +341,9 @@ LogicalResult tosa::ClampOp::verify() {
if (inputETy != outputETy)
return emitOpError("input/output element types are incompatible.");
- // if input datatype is float, check that the two min/max_fp attributes share
- // the same type and that their type is either the same of the input's
- // datatype, or a float type whose bitwidth > input datatype bitwidth
+  // If the input datatype is float, check that the two min/max_fp attributes
+  // share the same type and that their type is either the same as the input's
+  // datatype, or a float type whose bitwidth > input datatype bitwidth.
if (!inputETy.isInteger(dataTypeBitWidth)) {
if (((maxFpType != minFpType) ||
(maxFpType != inputETy && maxFpType.getIntOrFloatBitWidth() <=
@@ -383,7 +383,8 @@ static void buildConvOpWithQuantInfo(OpBuilder &builder, OperationState &result,
}
}
-/// Handles tosa.transpose_conv2d which has outpad and output shape attributes.
+/// Handles tosa.transpose_conv2d which has outpad and output shape
+/// attributes.
static void buildTransConvOpWithQuantInfo(
OpBuilder &builder, OperationState &result, Type outputType, Value input,
Value weight, Value bias, DenseI64ArrayAttr outpad,
@@ -420,9 +421,9 @@ static void buildFCOpWithQuantInfo(OpBuilder &builder, OperationState &result,
}
}
-/// The tosa.matmul op is also intended to be generated where a fully_connected
-/// op must be constructed where the weight is not a constant. In this case,
-/// the fully_connected op must be expressed using matmul.
+/// The tosa.matmul op is also intended to be generated when a
+/// fully_connected op must be constructed but the weight is not a constant.
+/// In this case, the fully_connected op must be expressed using matmul.
/// TODO: Add link to the legalization document explaining this.
static void buildMatMulOpWithQuantInfo(OpBuilder &builder,
OperationState &result, Type outputType,
@@ -457,9 +458,9 @@ static void buildMatMulOpWithQuantInfo(OpBuilder &builder,
}
}
-/// Both the tosa.avg_pool2d and unary ops use the same UnaruOpQuantizationAttr
-/// but avg_pool operator has its own builder as it has additional parameters
-/// not part of the unary ops.
+/// Both the tosa.avg_pool2d and unary ops use the same
+/// UnaryOpQuantizationAttr, but the avg_pool operator has its own builder as
+/// it has additional parameters not part of the unary ops.
static void
buildAvgPool2dOpWithQuantInfo(OpBuilder &builder, OperationState &result,
Type outputType, Value input,
@@ -526,8 +527,8 @@ static LogicalResult resolveBroadcastShape(const ValueShapeRange &operands,
for (int i = 0, e = operands.size(); i != e; ++i) {
auto shape = operands.getShape(i);
if (!shape.hasRank()) {
- // TODO(jennik): Update function to have better case handling for invalid
- // operands and for ranked tensors.
+ // TODO(jennik): Update function to have better case handling for
+ // invalid operands and for ranked tensors.
return failure();
}
outRank = std::max<int64_t>(outRank, shape.getRank());
@@ -776,8 +777,8 @@ LogicalResult tosa::PadOp::inferReturnTypeComponents(
return success();
}
- // If the input rank is unknown we can info the output rank using the padding
- // shape's first dim.
+  // If the input rank is unknown we can infer the output rank using the
+  // padding shape's first dim.
if (!inputShape.hasRank()) {
if (paddingShape.isDynamicDim(0)) {
inferredReturnShapes.push_back(ShapedTypeComponents());
@@ -1000,10 +1001,6 @@ llvm::LogicalResult tosa::ReshapeOp::verify() {
TensorType inputType = getInput1().getType();
RankedTensorType outputType = getType();
- if (hasZeroDimension(inputType) || hasZeroDimension(outputType))
- return emitOpError() << "tensor has a dimension with size zero. Each "
- "dimension of a tensor must have size >= 1";
-
if ((int64_t)getNewShape().size() != outputType.getRank())
return emitOpError() << "new shape does not match result rank";
@@ -1034,16 +1031,15 @@ llvm::LogicalResult tosa::ReshapeOp::verify() {
return mlir::success();
}
-LogicalResult tosa::TransposeOp::getConstantPerms(SmallVector<int64_t> &perms) {
+LogicalResult tosa::TransposeOp::getConstantPerms(SmallVector<int32_t> &perms) {
// Perms must be constants.
DenseIntElementsAttr permsAttr;
if (!matchPattern(getPerms(), m_Constant(&permsAttr)))
return failure();
- // Transpose is not the identity transpose.
- perms = llvm::to_vector(
- llvm::map_range(permsAttr.getValues<APInt>(),
- [](const APInt &val) { return val.getSExtValue(); }));
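+  // Extract the permutation elements as int32_t; TOSA perms are i32 tensors.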
+ perms.clear();
+ for (auto v : permsAttr.getValues<APInt>())
+ perms.push_back(v.getSExtValue());
return success();
}
@@ -1067,8 +1063,8 @@ LogicalResult tosa::TransposeOp::inferReturnTypeComponents(
return success();
}
- // This would imply the number of permutations does not match the rank of the
- // input which is illegal.
+  // This would imply the number of permutations does not match the rank of
+  // the input, which is illegal.
if (permsShape.getDimSize(0) != inputShape.getRank()) {
return failure();
}
@@ -1154,19 +1150,38 @@ LogicalResult tosa::TransposeOp::verify() {
<< " (output rank) but got size "
<< permType.getDimSize(0);
- SmallVector<int64_t> constantPerms;
+ SmallVector<int32_t> constantPerms;
if (succeeded(getConstantPerms(constantPerms))) {
- // Assert that the permutation tensor has a rank, which means that the rank
- // has been verified above.
+ // Assert that the permutation tensor has a rank, which means that the
+ // rank has been verified above.
assert(permType.hasRank() &&
"Unexpectedly found permutation tensor without rank");
- if (!isPermutationVector(constantPerms))
+ if (!llvm::all_of(constantPerms,
+ [&constantPerms](int32_t s) {
+ return s >= 0 &&
+ static_cast<size_t>(s) < constantPerms.size();
+ }) ||
+ !isPermutationVector(llvm::to_vector(llvm::map_range(
+ constantPerms, [](int32_t v) -> int64_t { return v; }))))
return emitOpError() << "expected valid permutation tensor";
- if (inputType.hasRank() && !llvm::all_of(constantPerms, [&](int64_t s) {
- return s < inputType.getRank();
- })) {
- return emitOpError() << "permutation must be within input bounds";
+    // Verify that the output shape is the input shape permuted by the
+    // constant perms.
+ if (inputType.hasRank() && outputType.hasRank()) {
+ assert(constantPerms.size() == static_cast<size_t>(inputType.getRank()) &&
+ inputType.getRank() == outputType.getRank());
+
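+      // Dynamic dimensions cannot be checked statically, so skip them.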
+ for (auto i = 0; i < outputType.getRank(); i++) {
+ if (inputType.isDynamicDim(constantPerms[i]) ||
+ outputType.isDynamicDim(i))
+ continue;
+
+ if (inputType.getDimSize(constantPerms[i]) != outputType.getDimSize(i))
+ return emitOpError()
+ << "expected output tensor dim " << i << " to match "
+ << "input dim " << constantPerms[i] << " with value of "
+ << inputType.getDimSize(constantPerms[i]);
+ }
}
}
return success();
@@ -1175,7 +1190,7 @@ LogicalResult tosa::TransposeOp::verify() {
LogicalResult TransposeOp::reifyResultShapes(
OpBuilder &builder, ReifiedRankedShapedTypeDims &reifiedReturnShapes) {
- SmallVector<int64_t> transposePerms;
+ SmallVector<int32_t> transposePerms;
if (getConstantPerms(transposePerms).failed())
return failure();
@@ -1184,7 +1199,7 @@ LogicalResult TransposeOp::reifyResultShapes(
SmallVector<OpFoldResult> returnedDims(inputType.getRank());
for (auto dim : transposePerms) {
- int64_t dimInInput = transposePerms[dim];
+ int32_t dimInInput = transposePerms[dim];
if (inputType.isDynamicDim(dimInInput))
returnedDims[dim] =
builder.create<tensor::DimOp>(getLoc(), input, dimInInput)
@@ -1378,8 +1393,8 @@ static LogicalResult verifyReduceOp(T op) {
<< ")";
return failure();
}
- // We can only verify the reduced dimension size to be 1 if this is not the
- // special case of output rank == 0.
+ // We can only verify the reduced dimension size to be 1 if this is not
+ // the special case of output rank == 0.
if (outputRank != 0) {
auto outputShape = outputType.getShape();
if (!outputType.isDynamicDim(reduceAxis) &&
diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir
index 39699ee..0d55d1899 100644
--- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir
+++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir
@@ -1,6 +1,6 @@
-// RUN: mlir-opt --split-input-file -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named))" %s -verify-diagnostics -o -| FileCheck %s
-// RUN: mlir-opt --split-input-file -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named{prefer-conv2d-kernel-layout-hwcf=true}))" %s -verify-diagnostics -o -| FileCheck --check-prefix="HWCF" %s
-// RUN: mlir-opt --split-input-file -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named,cse))" %s -verify-diagnostics -o -| FileCheck --check-prefix="CHECK-CSE" %s
+// RUN: mlir-opt --verify-each --split-input-file -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named))" %s -verify-diagnostics -o -| FileCheck %s
+// RUN: mlir-opt --verify-each --split-input-file -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named{prefer-conv2d-kernel-layout-hwcf=true}))" %s -verify-diagnostics -o -| FileCheck --check-prefix="HWCF" %s
+// RUN: mlir-opt --verify-each --split-input-file -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named,cse))" %s -verify-diagnostics -o -| FileCheck --check-prefix="CHECK-CSE" %s
// CHECK-LABEL: @matmul
func.func @matmul(%arg0: tensor<1x5x3xf32>, %arg1: tensor<1x3x6xf32>) -> (tensor<1x5x6xf32>) {
@@ -521,7 +521,7 @@ func.func @conv2d_scalar_bias_f32(%input: tensor<1x49x42x27xf32>, %weights: tens
// CHECK-LABEL: @conv2d_i8
func.func @conv2d_i8(%input: tensor<1x49x42x27xi8>, %weights: tensor<28x1x1x27xi8>, %bias: tensor<28xi8>) -> () {
- // HWCF: %[[TRANSPOSE_DIMS:.+]] = arith.constant dense<[1, 2, 3, 0]> : tensor<4xi64>
+ // HWCF: %[[TRANSPOSE_DIMS:.+]] = arith.constant dense<[1, 2, 3, 0]> : tensor<4xi32>
// HWCF: %[[TRANSPOSE:.+]] = linalg.transpose ins(%arg1 : tensor<28x1x1x27xi8>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<1x1x27x28xi8>) permutation = [1, 2, 3, 0]
// CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x45x40x28xi32>
// CHECK: %[[BROADCAST:.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg2 : tensor<28xi8>) outs(%[[INIT]] : tensor<1x45x40x28xi32>) {
@@ -542,7 +542,7 @@ func.func @conv2d_i8(%input: tensor<1x49x42x27xi8>, %weights: tensor<28x1x1x27xi
// CHECK-LABEL: @conv2d_f32
func.func @conv2d_f32(%input: tensor<1x49x42x27xf32>, %weights: tensor<28x3x3x27xf32>, %bias: tensor<28xf32>) -> () {
- // HWCF: %[[TRANSPOSE_DIMS:.+]] = arith.constant dense<[1, 2, 3, 0]> : tensor<4xi64>
+ // HWCF: %[[TRANSPOSE_DIMS:.+]] = arith.constant dense<[1, 2, 3, 0]> : tensor<4xi32>
// HWCF: %[[TRANSPOSE:.+]] = linalg.transpose ins(%arg1 : tensor<28x3x3x27xf32>) outs(%[[TRANSPOSEDINIT:.+]] : tensor<3x3x27x28xf32>) permutation = [1, 2, 3, 0]
// CHECK: %[[INIT:.+]] = tensor.empty() : tensor<1x45x40x28xf32>
diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-pipeline.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-pipeline.mlir
index c2bbfd5..73da281 100644
--- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-pipeline.mlir
+++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-pipeline.mlir
@@ -24,7 +24,7 @@ func.func @tensor_with_unknown_rank(%arg0: tensor<*xi8>) -> tensor<*xi8> {
// check that the tosa verifier kicks in
func.func @test_avg_pool2d_zero_dim_input(%arg0: tensor<1x0x?x9xf32>) -> tensor<1x7x7x9xf32> {
- // expected-error@+1 {{'tosa.avg_pool2d' op tensor has a dimension with size zero. Each dimension of a tensor must have size >= 1}}
+ // expected-error@+1 {{'tosa.avg_pool2d' op operand #0 must be 4-d tosa-conformant tensor, but got 'tensor<1x0x?x9xf32>'}}
%0 = "tosa.avg_pool2d"(%arg0) {acc_type = f32, kernel = array<i64: 2, 2>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 1>}
: (tensor<1x0x?x9xf32>) -> tensor<1x7x7x9xf32>
return %0 : tensor<1x7x7x9xf32>
diff --git a/mlir/test/Dialect/Tosa/constant-op-fold.mlir b/mlir/test/Dialect/Tosa/constant-op-fold.mlir
index 8e19f87..2902c4a 100644
--- a/mlir/test/Dialect/Tosa/constant-op-fold.mlir
+++ b/mlir/test/Dialect/Tosa/constant-op-fold.mlir
@@ -80,14 +80,14 @@ func.func @transpose_fold_4d_int() -> tensor<3x1x4x2xi32> {
[[ 0, 1, 2, 3], [ 4, 5, 6, 7], [ 8, 9, 10, 11]],
[[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]
]]> : tensor<1x2x3x4xi32>} : () -> tensor<1x2x3x4xi32>
- %perms = "tosa.const"() {value = dense<[2, 0, 3, 1]> : tensor<4xi64>} : () -> tensor<4xi64>
+ %perms = "tosa.const"() {value = dense<[2, 0, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32>
// CHECK: %[[CST:.+]] = "tosa.const"() <{
// CHECK-SAME{LITERAL}: value = dense<[
// CHECK-SAME{LITERAL}: [[[0, 12], [1, 13], [2, 14], [3, 15]]],
// CHECK-SAME{LITERAL}: [[[4, 16], [5, 17], [6, 18], [7, 19]]],
// CHECK-SAME{LITERAL}: [[[8, 20], [9, 21], [10, 22], [11, 23]]]
// CHECK-SAME{LITERAL}: ]>
- %1 = tosa.transpose %input, %perms : (tensor<1x2x3x4xi32>, tensor<4xi64>) -> tensor<3x1x4x2xi32>
+ %1 = tosa.transpose %input, %perms : (tensor<1x2x3x4xi32>, tensor<4xi32>) -> tensor<3x1x4x2xi32>
// CHECK: return %[[CST]]
return %1 : tensor<3x1x4x2xi32>
}
diff --git a/mlir/test/Dialect/Tosa/invalid.mlir b/mlir/test/Dialect/Tosa/invalid.mlir
index 418f768..414bcfe 100644
--- a/mlir/test/Dialect/Tosa/invalid.mlir
+++ b/mlir/test/Dialect/Tosa/invalid.mlir
@@ -1,6 +1,22 @@
// RUN: mlir-opt %s -split-input-file -verify-diagnostics --tosa-validate=strict-op-spec-alignment
+func.func @test_const() -> tensor<1xf32> {
+ // expected-error@+1{{'tosa.const' op expected same attr/result element types}}
+ %0 = "tosa.const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xf32>
+ return %0 : tensor<1xf32>
+}
+
+// -----
+
+func.func @test_const_non_tensor_attr() {
+  // expected-error@+1{{'tosa.const' op expected tensors for attr/result type}}
+ %0 = "tosa.const"() {value = dense<1.0> : vector<f32>} : () -> tensor<f32>
+ return
+}
+
+// -----
+
func.func @test_conv2d(%arg0: tensor<1x29x29x4xf32>, %arg1: tensor<16x3x3x4xi8>, %arg2: tensor<16xi8>) -> tensor<1x27x27x16xi8> {
// expected-error@+1 {{expect both input and weight to be float or not together, got 'f32' and 'i8'}}
%0 = tosa.conv2d %arg0, %arg1, %arg2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
@@ -148,6 +164,42 @@ func.func @test_transpose_invalid_permutation_tensor(%arg0: tensor<13x21x3xf32>)
// -----
+func.func @test_transpose_invalid_permutation_negative(%arg0: tensor<3x2xi32>) -> tensor<*xi32> {
+ %perms = "tosa.const"() {value = dense<[-1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
+ // expected-error@+1 {{'tosa.transpose' op expected valid permutation tensor}}
+ %1 = tosa.transpose %arg0, %perms : (tensor<3x2xi32>, tensor<2xi32>) -> tensor<*xi32>
+ return %1 : tensor<*xi32>
+}
+
+// -----
+
+func.func @test_transpose_invalid_permutation_tensor_above_range(%arg0: tensor<3x2xi32>) -> tensor<*xi32> {
+ %perms = "tosa.const"() {value = dense<[2, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
+ // expected-error@+1 {{'tosa.transpose' op expected valid permutation tensor}}
+ %1 = tosa.transpose %arg0, %perms : (tensor<3x2xi32>, tensor<2xi32>) -> tensor<*xi32>
+ return %1 : tensor<*xi32>
+}
+
+// -----
+
+func.func @test_transpose_invalid_permutation_types(%arg0: tensor<3x2xi32>) -> tensor<3x4xi32> {
+ %perms = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
+ // expected-error@+1 {{'tosa.transpose' op expected output tensor dim 0 to match input dim 1 with value of 2}}
+ %1 = tosa.transpose %arg0, %perms : (tensor<3x2xi32>, tensor<2xi32>) -> tensor<3x4xi32>
+ return %1 : tensor<3x4xi32>
+}
+
+// -----
+
+func.func @test_transpose_invalid_permutation_types_dynamic_dim_ok(%arg0: tensor<2x?xi32>) -> tensor<3x4xi32> {
+ %perms = "tosa.const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32>
+ // expected-error@+1 {{'tosa.transpose' op expected output tensor dim 1 to match input dim 0 with value of 2}}
+ %1 = tosa.transpose %arg0, %perms : (tensor<2x?xi32>, tensor<2xi32>) -> tensor<3x4xi32>
+ return %1 : tensor<3x4xi32>
+}
+
+// -----
+
func.func @test_fully_connected_non_const(%arg0: tensor<13x21x3xf32>, %arg1: tensor<2x3xf32>) -> tensor<273x2xf32> {
%0 = "tosa.const"() {value = dense<0.000000e+00> : tensor<2xf32>} : () -> tensor<2xf32>
%1 = tosa.reshape %arg0 {new_shape = array<i64: 273, 3>} : (tensor<13x21x3xf32>) -> tensor<273x3xf32>
@@ -269,7 +321,7 @@ func.func @test_reshape_type_mismatch(%arg0 : tensor<13x21x3xf32>) -> () {
// -----
func.func @test_reshape_static_zero_dim_input(%arg0 : tensor<13x0x3xf32>) -> () {
- // expected-error@+1 {{'tosa.reshape' op tensor has a dimension with size zero. Each dimension of a tensor must have size >= 1}}
+ // expected-error@+1 {{'tosa.reshape' op operand #0 must be tosa-conformant tensor of number values, but got 'tensor<13x0x3xf32>'}}
%0 = "tosa.reshape"(%arg0) {new_shape = array<i64: 13, 21, 3>} : (tensor<13x0x3xf32>) -> tensor<13x0x3xf32>
return
}
@@ -277,7 +329,7 @@ func.func @test_reshape_static_zero_dim_input(%arg0 : tensor<13x0x3xf32>) -> ()
// -----
func.func @test_reshape_zero_dim_input(%arg0 : tensor<?x0x3xf32>) -> () {
- // expected-error@+1 {{'tosa.reshape' op tensor has a dimension with size zero. Each dimension of a tensor must have size >= 1}}
+ // expected-error@+1 {{'tosa.reshape' op operand #0 must be tosa-conformant tensor of number values, but got 'tensor<?x0x3xf32>'}}
%0 = "tosa.reshape"(%arg0) {new_shape = array<i64: 13, 21, 3>} : (tensor<?x0x3xf32>) -> tensor<13x0x3xf32>
return
}
@@ -341,7 +393,7 @@ func.func @test_const_attribute_type_mismatch() -> tensor<100x100xf32> {
// -----
func.func @test_conv2d_static_zero_dim_input(%arg0: tensor<1x29x0x4xf32>, %arg1: tensor<16x3x3x4xf32>, %arg2: tensor<16xf32>) -> tensor<1x27x27x16xf32> {
- // expected-error@+1 {{'tosa.conv2d' op tensor has a dimension with size zero. Each dimension of a tensor must have size >= 1}}
+ // expected-error@+1 {{'tosa.conv2d' op operand #0 must be 4-d tosa-conformant tensor, but got 'tensor<1x29x0x4xf32>'}}
%0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
: (tensor<1x29x0x4xf32>, tensor<16x3x3x4xf32>, tensor<16xf32>) -> tensor<1x27x27x16xf32>
return %0 : tensor<1x27x27x16xf32>
@@ -350,8 +402,8 @@ func.func @test_conv2d_static_zero_dim_input(%arg0: tensor<1x29x0x4xf32>, %arg1:
// -----
func.func @test_conv2d_zero_dim_input(%arg0: tensor<1x?x0x4xf32>, %arg1: tensor<16x3x3x4xf32>, %arg2: tensor<16xf32>) -> tensor<1x27x27x16xf32> {
- // expected-error@+1 {{'tosa.conv2d' op tensor has a dimension with size zero. Each dimension of a tensor must have size >= 1}}
- %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
+ // expected-error@+1 {{'tosa.conv2d' op operand #0 must be 4-d tosa-conformant tensor, but got 'tensor<1x?x0x4xf32>'}}
+ %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {acc_type = f32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
: (tensor<1x?x0x4xf32>, tensor<16x3x3x4xf32>, tensor<16xf32>) -> tensor<1x27x27x16xf32>
return %0 : tensor<1x27x27x16xf32>
}
@@ -360,7 +412,7 @@ func.func @test_conv2d_zero_dim_input(%arg0: tensor<1x?x0x4xf32>, %arg1: tensor<
// -----
func.func @test_avg_pool2d_static_zero_dim_input(%arg0: tensor<1x0x7x9xf32>) -> tensor<1x7x7x9xf32> {
- // expected-error@+1 {{'tosa.avg_pool2d' op tensor has a dimension with size zero. Each dimension of a tensor must have size >= 1}}
+ // expected-error@+1 {{'tosa.avg_pool2d' op operand #0 must be 4-d tosa-conformant tensor, but got 'tensor<1x0x7x9xf32>'}}
%0 = "tosa.avg_pool2d"(%arg0) {acc_type = f32, kernel = array<i64: 2, 2>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 1>}
: (tensor<1x0x7x9xf32>) -> tensor<1x7x7x9xf32>
return %0 : tensor<1x7x7x9xf32>
@@ -369,7 +421,7 @@ func.func @test_avg_pool2d_static_zero_dim_input(%arg0: tensor<1x0x7x9xf32>) ->
// -----
func.func @test_avg_pool2d_zero_dim_input(%arg0: tensor<1x0x?x9xf32>) -> tensor<1x7x7x9xf32> {
- // expected-error@+1 {{'tosa.avg_pool2d' op tensor has a dimension with size zero. Each dimension of a tensor must have size >= 1}}
+ // expected-error@+1 {{'tosa.avg_pool2d' op operand #0 must be 4-d tosa-conformant tensor, but got 'tensor<1x0x?x9xf32>'}}
%0 = "tosa.avg_pool2d"(%arg0) {acc_type = f32, kernel = array<i64: 2, 2>, pad = array<i64: 0, 1, 0, 1>, stride = array<i64: 1, 1>}
: (tensor<1x0x?x9xf32>) -> tensor<1x7x7x9xf32>
return %0 : tensor<1x7x7x9xf32>
@@ -469,7 +521,7 @@ func.func @test_tile_io_rank_mismatch() {
// CHECK-LABEL: @test_invalid_constant_permutation
func.func @test_invalid_constant_permutation() {
- // expected-error@+3 {{permutation must be within input bounds}}
+ // expected-error@+3 {{'tosa.transpose' op expected valid permutation tensor}}
%0 = tensor.empty() : tensor<3x4x5xi32>
%1 = arith.constant dense<[3, 0, 1]> : tensor<3xi32>
%2 = tosa.transpose %0, %1 : (tensor<3x4x5xi32>, tensor<3xi32>) -> tensor<3x4x5xi32>
@@ -480,7 +532,7 @@ func.func @test_invalid_constant_permutation() {
// CHECK-LABEL: test_rank_size_constant_permutation
func.func @test_rank_size_constant_permutation() {
- // expected-error@+4 {{permutation must be within input bounds}}
+ // expected-error@+4 {{'tosa.transpose' op expected valid permutation tensor}}
%0 = arith.constant 6 : index
%1 = arith.constant dense<[0, 2]> : tensor<2xi32>
%2 = tensor.empty(%0) : tensor<?x27xi64>
@@ -492,7 +544,7 @@ func.func @test_rank_size_constant_permutation() {
// CHECK-LABEL: test_large_constant_permutation
func.func @test_large_constant_permutation() {
- // expected-error@+4 {{permutation must be within input bounds}}
+ // expected-error@+4 {{'tosa.transpose' op expected valid permutation tensor}}
%0 = arith.constant 6 : index
%1 = arith.constant dense<[1185677355, 332462212]> : tensor<2xi32>
%2 = tensor.empty(%0) : tensor<?x27xi64>
@@ -504,7 +556,7 @@ func.func @test_large_constant_permutation() {
// CHECK-LABEL: test_table_rank0_table
func.func @test_table_rank0_table(%arg0: tensor<64xi16>, %arg1: tensor<i16>) {
- // expected-error@+1 {{'tosa.table' op operand #1 must be 1-d tensor, but got 'tensor<i16>'}}
+ // expected-error@+1 {{'tosa.table' op operand #1 must be 1-d tosa-conformant tensor, but got 'tensor<i16>'}}
%0 = tosa.table %arg0, %arg1 : (tensor<64xi16>, tensor<i16>) -> tensor<64xi16>
return
}
diff --git a/mlir/test/Dialect/Tosa/ops.mlir b/mlir/test/Dialect/Tosa/ops.mlir
index 525ee91..a1600fd 100644
--- a/mlir/test/Dialect/Tosa/ops.mlir
+++ b/mlir/test/Dialect/Tosa/ops.mlir
@@ -574,6 +574,22 @@ func.func @test_transpose(%arg0: tensor<13x21x3xf32>) -> tensor<3x13x21xf32> {
}
// -----
+// CHECK-LABEL: transpose_dynamic_dim
+func.func @test_transpose_dynamic_dim(%arg0: tensor<13x?x3xf32>) -> tensor<3x13x?xf32> {
+ %0 = "tosa.const"() {value = dense<[2, 0, 1]> : tensor<3xi32>} : () -> tensor<3xi32>
+ %1 = tosa.transpose %arg0, %0 : (tensor<13x?x3xf32>, tensor<3xi32>) -> tensor<3x13x?xf32>
+ return %1 : tensor<3x13x?xf32>
+}
+
+// -----
+// CHECK-LABEL: transpose_half_dynamic_dim
+func.func @test_transpose_half_dynamic_dim(%arg0: tensor<13x3x3xf32>) -> tensor<3x13x?xf32> {
+ %0 = "tosa.const"() {value = dense<[2, 0, 1]> : tensor<3xi32>} : () -> tensor<3xi32>
+ %1 = tosa.transpose %arg0, %0 : (tensor<13x3x3xf32>, tensor<3xi32>) -> tensor<3x13x?xf32>
+ return %1 : tensor<3x13x?xf32>
+}
+
+// -----
// CHECK-LABEL: gather
func.func @test_gather(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x26xi32>) -> tensor<13x26x3xf32> {
%0 = tosa.gather %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<13x26xi32>) -> tensor<13x26x3xf32>
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index f3d3c74..b86fcac 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -273,6 +273,16 @@ libc_support_library(
)
libc_support_library(
+ name = "__support_macros_null_check",
+ hdrs = ["src/__support/macros/null_check.h"],
+ deps = [
+ ":__support_macros_config",
+ ":__support_macros_optimization",
+ ":__support_macros_sanitizer",
+ ],
+)
+
+libc_support_library(
name = "__support_common",
hdrs = [
"src/__support/common.h",
@@ -665,6 +675,7 @@ libc_support_library(
":__support_ctype_utils",
":__support_fputil_fp_bits",
":__support_fputil_rounding_mode",
+ ":__support_macros_null_check",
":__support_str_to_integer",
":__support_str_to_num_result",
":__support_uint128",